tools.func: add setup_nltk as new function (#14314)

This commit is contained in:
CanbiZ (MickLesk)
2026-05-08 15:39:20 +02:00
committed by GitHub
parent 24fbf24c6d
commit 02eaf288bf
6 changed files with 79 additions and 30 deletions

View File

@@ -81,11 +81,7 @@ STARTEOF
cp -r /opt/mealie/frontend/dist/* /opt/mealie/mealie/frontend/
msg_ok "Copied Frontend"
msg_info "Updating NLTK Data"
mkdir -p /nltk_data/
cd /opt/mealie
$STD uv run python -m nltk.downloader -d /nltk_data averaged_perceptron_tagger_eng
msg_ok "Updated NLTK Data"
setup_nltk "averaged_perceptron_tagger_eng" "/nltk_data"
msg_info "Starting Service"
systemctl start mealie

View File

@@ -164,13 +164,7 @@ function update_script() {
fi
fi
msg_info "Updating NLTK Data"
cd /opt/paperless
$STD uv run python -m nltk.downloader -d /usr/share/nltk_data snowball_data
$STD uv run python -m nltk.downloader -d /usr/share/nltk_data stopwords
$STD uv run python -m nltk.downloader -d /usr/share/nltk_data punkt_tab ||
$STD uv run python -m nltk.downloader -d /usr/share/nltk_data punkt
msg_ok "Updated NLTK Data"
setup_nltk "snowball_data stopwords punkt_tab" "/usr/share/nltk_data"
msg_info "Starting all Paperless-ngx Services"
systemctl start paperless-consumer paperless-webserver paperless-scheduler paperless-task-queue

View File

@@ -47,8 +47,7 @@ msg_info "Setting up KitchenOwl"
cd /opt/kitchenowl/backend
$STD uv sync --no-dev
sed -i 's/default=True/default=False/' /opt/kitchenowl/backend/wsgi.py
mkdir -p /nltk_data
$STD uv run python -m nltk.downloader -d /nltk_data averaged_perceptron_tagger_eng
setup_nltk "averaged_perceptron_tagger_eng" "/nltk_data"
JWT_SECRET=$(openssl rand -hex 32)
mkdir -p /opt/kitchenowl/data
cat <<EOF >/opt/kitchenowl/kitchenowl.env

View File

@@ -55,11 +55,7 @@ mkdir -p /opt/mealie/mealie/frontend
cp -r /opt/mealie/frontend/dist/* /opt/mealie/mealie/frontend/
msg_ok "Copied Frontend"
msg_info "Downloading NLTK Data"
mkdir -p /nltk_data/
cd /opt/mealie
$STD uv run python -m nltk.downloader -d /nltk_data averaged_perceptron_tagger_eng
msg_ok "Downloaded NLTK Data"
setup_nltk "averaged_perceptron_tagger_eng" "/nltk_data"
msg_info "Writing Environment File"
SECRET=$(openssl rand -hex 32)

View File

@@ -94,18 +94,12 @@ user.save()
EOF
msg_ok "Set up admin Paperless-ngx User & Password"
msg_info "Installing Natural Language Toolkit (Patience)"
cd /opt/paperless
$STD uv run python -m nltk.downloader -d /usr/share/nltk_data snowball_data
$STD uv run python -m nltk.downloader -d /usr/share/nltk_data stopwords
$STD uv run python -m nltk.downloader -d /usr/share/nltk_data punkt_tab ||
$STD uv run python -m nltk.downloader -d /usr/share/nltk_data punkt
setup_nltk "snowball_data stopwords punkt_tab" "/usr/share/nltk_data"
for policy_file in /etc/ImageMagick-6/policy.xml /etc/ImageMagick-7/policy.xml; do
if [[ -f "$policy_file" ]]; then
sed -i -e 's/rights="none" pattern="PDF"/rights="read|write" pattern="PDF"/' "$policy_file"
fi
done
msg_ok "Installed Natural Language Toolkit"
msg_info "Creating Services"
cat <<EOF >/etc/systemd/system/paperless-scheduler.service

View File

@@ -2095,10 +2095,10 @@ get_latest_gh_tag() {
local count
count=$(jq 'length' "$temp_file" 2>/dev/null || echo 0)
if [[ "$count" -gt 0 ]]; then
tag=$(jq -r '.[].ref' "$temp_file" \
| sed 's|^refs/tags/||' \
| sort -V \
| tail -n1)
tag=$(jq -r '.[].ref' "$temp_file" |
sed 's|^refs/tags/||' |
sort -V |
tail -n1)
fi
else
# No prefix: just take the first (newest) tag from /tags
@@ -9439,3 +9439,73 @@ function fetch_and_deploy_gl_release() {
msg_ok "Deployed: $app ($version)"
rm -rf "$tmpdir"
}
# ------------------------------------------------------------------------------
# Download NLTK data packages directly from GitHub, bypassing Python.
# Avoids CPU-instruction failures (SIGILL) on older hardware lacking AVX.
#
# Usage:
#   setup_nltk "averaged_perceptron_tagger_eng" "/nltk_data"
#   setup_nltk "snowball_data stopwords punkt_tab" "/usr/share/nltk_data"
#
# Parameters:
#   $1 - Whitespace-separated list of NLTK package IDs
#   $2 - Target directory (default: /usr/share/nltk_data)
#
# Behavior:
#   - Fetches the NLTK index.xml once and looks up each package's `subdir`,
#     `url` and `unzip` attributes from its index line.
#   - Packages flagged unzip="1" are extracted into <target>/<subdir>/;
#     all others are stored as <target>/<subdir>/<id>.zip (mode 0644).
#   - A failing package is reported and skipped; remaining packages are still
#     processed. Lookups never return non-zero on "no match", so the function
#     behaves the same with or without errexit/pipefail in the caller.
#
# Returns: 0 on success, non-zero if any package failed
# ------------------------------------------------------------------------------

# Print the value of XML attribute $2 from index line $1 (nothing if absent).
# Always returns 0 so that a missing attribute cannot trip errexit/ERR traps.
function _setup_nltk_attr() {
  local line="$1" attr="$2"
  local attr_re="(^|[[:space:]])${attr}=\"([^\"]*)\""
  if [[ "$line" =~ $attr_re ]]; then
    printf '%s' "${BASH_REMATCH[2]}"
  fi
  return 0
}

function setup_nltk() {
  local packages="${1:?setup_nltk requires at least one package name}"
  local target_dir="${2:-/usr/share/nltk_data}"
  local NLTK_INDEX_URL="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml"
  local index_xml rc=0

  ensure_dependencies unzip

  index_xml=$(curl_with_retry "$NLTK_INDEX_URL" "-") || {
    msg_error "Failed to fetch NLTK package index"
    return 1
  }

  # Split the package list on whitespace without exposing it to globbing.
  # `read -d ''` hits EOF and returns non-zero by design, hence `|| true`.
  local -a pkg_list=()
  IFS=$' \t\n' read -r -d '' -a pkg_list <<<"$packages" || true

  local pkg pkg_line subdir pkg_url do_unzip tmp_zip dest_dir
  for pkg in "${pkg_list[@]}"; do
    msg_info "Downloading NLTK: $pkg"

    # Fixed-string match (package IDs are not regexes); `|| true` keeps a
    # "not found" from aborting the caller before we can report it.
    pkg_line=$(grep -F -m1 -- "id=\"${pkg}\"" <<<"$index_xml" || true)
    if [[ -z "$pkg_line" ]]; then
      msg_error "NLTK package not found in index: $pkg"
      rc=1
      continue
    fi

    subdir=$(_setup_nltk_attr "$pkg_line" "subdir")
    pkg_url=$(_setup_nltk_attr "$pkg_line" "url")
    do_unzip=$(_setup_nltk_attr "$pkg_line" "unzip") # optional attribute
    if [[ -z "$subdir" || -z "$pkg_url" ]]; then
      msg_error "Could not parse NLTK index entry for: $pkg"
      rc=1
      continue
    fi

    dest_dir="${target_dir}/${subdir}"
    if ! mkdir -p "$dest_dir"; then
      msg_error "Could not create NLTK target directory: $dest_dir"
      rc=1
      continue
    fi

    if ! tmp_zip=$(mktemp --suffix=.zip); then
      msg_error "Could not create temporary file for NLTK package: $pkg"
      rc=1
      continue
    fi

    if ! CURL_TIMEOUT=120 curl_with_retry "$pkg_url" "$tmp_zip"; then
      msg_error "Failed to download NLTK package: $pkg"
      rm -f "$tmp_zip"
      rc=1
      continue
    fi

    if [[ "$do_unzip" == "1" ]]; then
      # Only report success when extraction actually worked.
      if $STD unzip -q -o "$tmp_zip" -d "${dest_dir}/"; then
        rm -f "$tmp_zip"
        msg_ok "Downloaded NLTK: $pkg"
      else
        rm -f "$tmp_zip"
        msg_error "Failed to extract NLTK package: $pkg"
        rc=1
      fi
    else
      if mv "$tmp_zip" "${dest_dir}/${pkg}.zip"; then
        # mktemp creates files with mode 0600; make the data readable for
        # services that do not run as the installing user.
        chmod 644 "${dest_dir}/${pkg}.zip"
        msg_ok "Downloaded NLTK: $pkg"
      else
        rm -f "$tmp_zip"
        msg_error "Failed to store NLTK package: $pkg"
        rc=1
      fi
    fi
  done

  return $rc
}