build: move search logic to subdir (#5298)
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
It has some stuff pretty specific to fsfe.org in there, and hence should not be in the global build. Co-authored-by: Darragh Elliott <me@delliott.net> Reviewed-on: #5298 Co-authored-by: delliott <delliott@fsfe.org> Co-committed-by: delliott <delliott@fsfe.org>
This commit was merged in pull request #5298.
This commit is contained in:
@@ -14,7 +14,6 @@ import logging
|
||||
import multiprocessing.pool
|
||||
from pathlib import Path
|
||||
|
||||
from .index_website import index_websites
|
||||
from .prepare_subdirectories import prepare_subdirectories
|
||||
from .update_css import update_css
|
||||
from .update_defaultxsls import update_defaultxsls
|
||||
@@ -36,14 +35,6 @@ def phase1_run(
|
||||
"""
|
||||
logger.info("Starting Phase 1 - Setup")
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Build search index
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# This step runs a Python tool that creates an index of all news and
|
||||
# articles. It extracts titles, teaser, tags, dates and potentially more.
|
||||
# The result will be fed into a JS file.
|
||||
index_websites(source_dir, languages, pool)
|
||||
# -----------------------------------------------------------------------------
|
||||
# Update CSS files
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
6
fsfe.org/search/__init__.py
Normal file
6
fsfe.org/search/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
# SPDX-FileCopyrightText: Free Software Foundation Europe e.V. <https://fsfe.org>
|
||||
#
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
# __init__.py is a special Python file that allows a directory to become
|
||||
# a Python package so it can be accessed using the 'import' statement.
|
||||
@@ -2,8 +2,6 @@
|
||||
#
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
# Build an index for the search engine based on the article titles and tags
|
||||
|
||||
import json
|
||||
import logging
|
||||
import multiprocessing.pool
|
||||
@@ -11,11 +9,10 @@ from pathlib import Path
|
||||
|
||||
import iso639
|
||||
import nltk
|
||||
from fsfe_website_build.lib.misc import update_if_changed
|
||||
from lxml import etree
|
||||
from nltk.corpus import stopwords as nltk_stopwords
|
||||
|
||||
from fsfe_website_build.lib.misc import update_if_changed
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -40,7 +37,6 @@ def _process_file(file: Path, stopwords: set[str]) -> dict:
|
||||
"""
|
||||
Generate the search index entry for a given file and set of stopwords
|
||||
"""
|
||||
logger.debug("Processing file %s", file)
|
||||
xslt_root = etree.parse(file)
|
||||
tags = (
|
||||
tag.get("key")
|
||||
@@ -71,21 +67,18 @@ def _process_file(file: Path, stopwords: set[str]) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def index_websites(
|
||||
source_dir: Path,
|
||||
languages: list[str],
|
||||
pool: multiprocessing.pool.Pool,
|
||||
) -> None:
|
||||
def run(languages: list[str], processes: int, working_dir: Path) -> None:
|
||||
"""
|
||||
Generate a search index for all sites that have a search/search.js file
|
||||
This step runs a Python tool that creates an index of all news and
|
||||
articles. It extracts titles, teaser, tags, dates and potentially more.
|
||||
The result will be fed into a JS file.
|
||||
"""
|
||||
logger.info("Creating search indexes")
|
||||
# Download all stopwords
|
||||
nltkdir = "./.nltk_data"
|
||||
source_dir = working_dir.parent
|
||||
nltk.data.path = [nltkdir, *nltk.data.path]
|
||||
nltk.download("stopwords", download_dir=nltkdir, quiet=True)
|
||||
# Iterate over sites
|
||||
if source_dir.joinpath("search/search.js").exists():
|
||||
with multiprocessing.Pool(processes) as pool:
|
||||
logger.debug("Indexing %s", source_dir)
|
||||
|
||||
# Get all xhtml files in languages to be processed
|
||||
@@ -122,6 +115,6 @@ def index_websites(
|
||||
articles = pool.starmap(_process_file, files_with_stopwords)
|
||||
|
||||
update_if_changed(
|
||||
source_dir.joinpath("search/index.js"),
|
||||
working_dir.joinpath("index.js"),
|
||||
"var pages = " + json.dumps(articles, ensure_ascii=False),
|
||||
)
|
||||
Reference in New Issue
Block a user