From 02eaf288bf616afb1b3d2d8ce2292497b99210d7 Mon Sep 17 00:00:00 2001 From: "CanbiZ (MickLesk)" <47820557+MickLesk@users.noreply.github.com> Date: Fri, 8 May 2026 15:39:20 +0200 Subject: [PATCH] tools.func: add setup_nltk as new function (#14314) --- ct/mealie.sh | 6 +-- ct/paperless-ngx.sh | 8 +--- install/kitchenowl-install.sh | 3 +- install/mealie-install.sh | 6 +-- install/paperless-ngx-install.sh | 8 +--- misc/tools.func | 78 ++++++++++++++++++++++++++++++-- 6 files changed, 79 insertions(+), 30 deletions(-) diff --git a/ct/mealie.sh b/ct/mealie.sh index 89312ddc1..92f9db888 100644 --- a/ct/mealie.sh +++ b/ct/mealie.sh @@ -81,11 +81,7 @@ STARTEOF cp -r /opt/mealie/frontend/dist/* /opt/mealie/mealie/frontend/ msg_ok "Copied Frontend" - msg_info "Updating NLTK Data" - mkdir -p /nltk_data/ - cd /opt/mealie - $STD uv run python -m nltk.downloader -d /nltk_data averaged_perceptron_tagger_eng - msg_ok "Updated NLTK Data" + setup_nltk "averaged_perceptron_tagger_eng" "/nltk_data" msg_info "Starting Service" systemctl start mealie diff --git a/ct/paperless-ngx.sh b/ct/paperless-ngx.sh index e511669aa..271932a37 100644 --- a/ct/paperless-ngx.sh +++ b/ct/paperless-ngx.sh @@ -164,13 +164,7 @@ function update_script() { fi fi - msg_info "Updating NLTK Data" - cd /opt/paperless - $STD uv run python -m nltk.downloader -d /usr/share/nltk_data snowball_data - $STD uv run python -m nltk.downloader -d /usr/share/nltk_data stopwords - $STD uv run python -m nltk.downloader -d /usr/share/nltk_data punkt_tab || - $STD uv run python -m nltk.downloader -d /usr/share/nltk_data punkt - msg_ok "Updated NLTK Data" + setup_nltk "snowball_data stopwords punkt_tab" "/usr/share/nltk_data" msg_info "Starting all Paperless-ngx Services" systemctl start paperless-consumer paperless-webserver paperless-scheduler paperless-task-queue diff --git a/install/kitchenowl-install.sh b/install/kitchenowl-install.sh index 590530a08..e2cea6cfa 100644 --- a/install/kitchenowl-install.sh +++ 
b/install/kitchenowl-install.sh @@ -47,8 +47,7 @@ msg_info "Setting up KitchenOwl" cd /opt/kitchenowl/backend $STD uv sync --no-dev sed -i 's/default=True/default=False/' /opt/kitchenowl/backend/wsgi.py -mkdir -p /nltk_data -$STD uv run python -m nltk.downloader -d /nltk_data averaged_perceptron_tagger_eng +setup_nltk "averaged_perceptron_tagger_eng" "/nltk_data" JWT_SECRET=$(openssl rand -hex 32) mkdir -p /opt/kitchenowl/data cat </opt/kitchenowl/kitchenowl.env diff --git a/install/mealie-install.sh b/install/mealie-install.sh index 1a821d1cb..f29ea36b4 100644 --- a/install/mealie-install.sh +++ b/install/mealie-install.sh @@ -55,11 +55,7 @@ mkdir -p /opt/mealie/mealie/frontend cp -r /opt/mealie/frontend/dist/* /opt/mealie/mealie/frontend/ msg_ok "Copied Frontend" -msg_info "Downloading NLTK Data" -mkdir -p /nltk_data/ -cd /opt/mealie -$STD uv run python -m nltk.downloader -d /nltk_data averaged_perceptron_tagger_eng -msg_ok "Downloaded NLTK Data" +setup_nltk "averaged_perceptron_tagger_eng" "/nltk_data" msg_info "Writing Environment File" SECRET=$(openssl rand -hex 32) diff --git a/install/paperless-ngx-install.sh b/install/paperless-ngx-install.sh index 2b4206251..b7d1745a4 100644 --- a/install/paperless-ngx-install.sh +++ b/install/paperless-ngx-install.sh @@ -94,18 +94,12 @@ user.save() EOF msg_ok "Set up admin Paperless-ngx User & Password" -msg_info "Installing Natural Language Toolkit (Patience)" -cd /opt/paperless -$STD uv run python -m nltk.downloader -d /usr/share/nltk_data snowball_data -$STD uv run python -m nltk.downloader -d /usr/share/nltk_data stopwords -$STD uv run python -m nltk.downloader -d /usr/share/nltk_data punkt_tab || - $STD uv run python -m nltk.downloader -d /usr/share/nltk_data punkt +setup_nltk "snowball_data stopwords punkt_tab" "/usr/share/nltk_data" for policy_file in /etc/ImageMagick-6/policy.xml /etc/ImageMagick-7/policy.xml; do if [[ -f "$policy_file" ]]; then sed -i -e 's/rights="none" pattern="PDF"/rights="read|write" 
pattern="PDF"/' "$policy_file" fi done -msg_ok "Installed Natural Language Toolkit" msg_info "Creating Services" cat </etc/systemd/system/paperless-scheduler.service diff --git a/misc/tools.func b/misc/tools.func index 3ef5c84a6..dd01b4619 100644 --- a/misc/tools.func +++ b/misc/tools.func @@ -2095,10 +2095,10 @@ get_latest_gh_tag() { local count count=$(jq 'length' "$temp_file" 2>/dev/null || echo 0) if [[ "$count" -gt 0 ]]; then - tag=$(jq -r '.[].ref' "$temp_file" \ - | sed 's|^refs/tags/||' \ - | sort -V \ - | tail -n1) + tag=$(jq -r '.[].ref' "$temp_file" | + sed 's|^refs/tags/||' | + sort -V | + tail -n1) fi else # No prefix: just take the first (newest) tag from /tags @@ -9439,3 +9439,73 @@ function fetch_and_deploy_gl_release() { msg_ok "Deployed: $app ($version)" rm -rf "$tmpdir" } + +# ------------------------------------------------------------------------------ +# Download NLTK data packages directly from GitHub, bypassing Python. +# Avoids CPU-instruction failures (SIGILL) on older hardware lacking AVX. 
# ------------------------------------------------------------------------------
# Download NLTK data packages directly from GitHub, bypassing Python.
# Avoids CPU-instruction failures (SIGILL) on older hardware lacking AVX.
#
# Usage:
#   setup_nltk "averaged_perceptron_tagger_eng" "/nltk_data"
#   setup_nltk "snowball_data stopwords punkt_tab" "/usr/share/nltk_data"
#
# Parameters:
#   $1 - Space-separated list of NLTK package IDs
#   $2 - Target directory (default: /usr/share/nltk_data)
#
# Behaviour:
#   - Looks each ID up in the NLTK index.xml (one <package .../> per line).
#   - Packages flagged unzip="1" are extracted into <target>/<subdir>/;
#     all others are stored as <target>/<subdir>/<id>.zip.
#   - A failing package never aborts the loop: the error is reported, the
#     remaining packages are still processed, and the final status is non-zero.
#
# Returns: 0 on success, non-zero if any package failed
# ------------------------------------------------------------------------------

# Print the value of XML attribute $2 found in the single-line element $1.
# Returns 1 (and prints nothing) when the attribute is absent or empty.
function _nltk_xml_attr() {
  # Leading (^|space) stops e.g. "unzip" from matching inside another name.
  local attr_re="(^|[[:space:]])${2}=\"([^\"]+)\""
  [[ "$1" =~ $attr_re ]] || return 1
  printf '%s\n' "${BASH_REMATCH[2]}"
}

function setup_nltk() {
  local packages="${1:?setup_nltk requires at least one package name}"
  local target_dir="${2:-/usr/share/nltk_data}"
  local NLTK_INDEX_URL="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml"
  local index_xml rc=0

  ensure_dependencies unzip

  index_xml=$(curl_with_retry "$NLTK_INDEX_URL" "-") || {
    msg_error "Failed to fetch NLTK package index"
    return 1
  }

  local pkg pkg_line subdir pkg_url do_unzip tmp_zip dest_dir
  # Word splitting is intentional: $packages is a space-separated ID list.
  # shellcheck disable=SC2086
  for pkg in $packages; do
    msg_info "Downloading NLTK: $pkg"

    # -F: match the ID literally (no regex metacharacters); -m1: first hit only.
    # The "|| ..." fallbacks keep a lookup miss from tripping errexit/ERR traps
    # in the calling script, so the explicit error branches below stay reachable.
    pkg_line=$(grep -m1 -F -- "id=\"${pkg}\"" <<<"$index_xml") || pkg_line=""
    if [[ -z "$pkg_line" ]]; then
      msg_error "NLTK package not found in index: $pkg"
      rc=1
      continue
    fi

    subdir=$(_nltk_xml_attr "$pkg_line" "subdir") || subdir=""
    pkg_url=$(_nltk_xml_attr "$pkg_line" "url") || pkg_url=""
    do_unzip=$(_nltk_xml_attr "$pkg_line" "unzip") || do_unzip=""

    if [[ -z "$subdir" || -z "$pkg_url" ]]; then
      msg_error "Could not parse NLTK index entry for: $pkg"
      rc=1
      continue
    fi

    dest_dir="${target_dir}/${subdir}"
    if ! mkdir -p -- "$dest_dir"; then
      msg_error "Failed to create NLTK directory: $dest_dir"
      rc=1
      continue
    fi

    tmp_zip=$(mktemp --suffix=.zip) || tmp_zip=""
    if [[ -z "$tmp_zip" ]]; then
      msg_error "Failed to create temporary file for NLTK package: $pkg"
      rc=1
      continue
    fi

    if ! CURL_TIMEOUT=120 curl_with_retry "$pkg_url" "$tmp_zip"; then
      msg_error "Failed to download NLTK package: $pkg"
      rm -f -- "$tmp_zip"
      rc=1
      continue
    fi

    if [[ "$do_unzip" == "1" ]]; then
      # $STD is the caller's optional output wrapper; it must stay unquoted so
      # an empty value disappears instead of becoming an empty command name.
      # shellcheck disable=SC2086
      if ! $STD unzip -q -o "$tmp_zip" -d "${dest_dir}/"; then
        msg_error "Failed to extract NLTK package: $pkg"
        rm -f -- "$tmp_zip"
        rc=1
        continue
      fi
    else
      # mktemp creates the file with mode 0600; widen it so the data is
      # readable by whichever user the consuming service runs as.
      if ! { mv -f -- "$tmp_zip" "${dest_dir}/${pkg}.zip" &&
        chmod 0644 -- "${dest_dir}/${pkg}.zip"; }; then
        msg_error "Failed to store NLTK package: $pkg"
        rm -f -- "$tmp_zip"
        rc=1
        continue
      fi
    fi

    # No-op when the archive was moved into place above.
    rm -f -- "$tmp_zip"
    msg_ok "Downloaded NLTK: $pkg"
  done

  return $rc
}