refactor: transform processor xsls only once (#5397)
All checks were successful
continuous-integration/drone/push Build is passing

# About
## Summary
Before, we iterated over XHTML files by directory, and RSS/ICS ones by processor (XSL stylesheet).

This worked, but it meant we had to pass the processor filepath to the processing functions, which in turn meant we were re-parsing the XSL many times — probably hundreds of times for some XSLs, such as the default one for a site.

We now iterate by processor instead, pre-parsing each stylesheet once before passing it to the processing code.

This gets us a significant speed increase (roughly 24% wall-clock, and a ~3x reduction in user CPU time) and makes the code a little cleaner.

# Benchmarks
## Before
```
Benchmark 1: uv run build --full
  Time (mean ± σ):     269.798 s ± 12.501 s    [User: 1566.724 s, System: 75.804 s]
  Range (min … max):   262.183 s … 284.226 s    3 runs
```
## After
```
Benchmark 1: uv run build --full
  Time (mean ± σ):     206.330 s ±  5.088 s    [User: 498.556 s, System: 47.233 s]
  Range (min … max):   202.760 s … 212.155 s    3 runs
```

Co-authored-by: Darragh Elliott <me@delliott.net>
Co-authored-by: tobiasd <tobiasd@fsfe.org>
Reviewed-on: #5397
Co-authored-by: delliott <delliott@fsfe.org>
Co-committed-by: delliott <delliott@fsfe.org>
This commit is contained in:
2025-10-13 11:24:25 +00:00
committed by tobiasd
parent 1673f97097
commit 5551375ac8
3 changed files with 103 additions and 95 deletions

View File

@@ -185,7 +185,7 @@ def _build_xmlstream(infile: Path, parser: etree.XMLParser) -> etree.Element:
return page
def process_file(infile: Path, processor: Path) -> etree._XSLTResultTree:
def process_file(infile: Path, transform: etree.XSLT) -> etree._XSLTResultTree:
"""
Process a given file using the correct xsl sheet
"""
@@ -193,8 +193,6 @@ def process_file(infile: Path, processor: Path) -> etree._XSLTResultTree:
lang = lang_from_filename(infile)
parser = etree.XMLParser(remove_blank_text=True, remove_comments=True)
xmlstream = _build_xmlstream(infile, parser)
xslt_tree = etree.parse(processor.resolve(), parser)
transform = etree.XSLT(xslt_tree)
result = transform(xmlstream)
# And now a bunch of regexes to fix some links.
# xx is the language code in all comments

View File

@@ -4,86 +4,68 @@
import logging
import multiprocessing.pool
from itertools import product
from pathlib import Path
from lxml import etree
from fsfe_website_build.lib.misc import get_basepath
from fsfe_website_build.lib.process_file import process_file
logger = logging.getLogger(__name__)
def _run_process(
target_file: Path,
processor: Path,
source_file: Path,
basename: Path,
lang: str,
) -> None:
# if the target file does not exist, we make it
if not target_file.exists() or any(
# If any source file is newer than the file to be generated
# we recreate the generated file
# if the source file does not exist, ignore it.
(
(file.exists() and file.stat().st_mtime > target_file.stat().st_mtime)
for file in [
(
source_file
if source_file.exists()
else basename.with_suffix(".en.xhtml")
),
processor,
(
source_file.parent.joinpath("." + basename.name).with_suffix(
".xmllist",
)
),
Path(f"global/data/texts/.texts.{lang}.xml"),
Path(f"global/data/topbanner/.topbanner.{lang}.xml"),
Path("global/data/texts/texts.en.xml"),
]
),
):
logger.debug("Building %s", target_file)
result = process_file(source_file, processor)
target_file.parent.mkdir(parents=True, exist_ok=True)
result.write_output(target_file)
def _process_dir(
source_dir: Path,
languages: list[str],
target: Path,
directory: Path,
) -> None:
for basename in {path.with_suffix("") for path in directory.glob("*.??.xhtml")}:
for lang in languages:
source_file = basename.with_suffix(f".{lang}.xhtml")
target_file = target.joinpath(
source_file.relative_to(source_dir),
).with_suffix(".html")
processor = (
basename.with_suffix(".xsl")
if basename.with_suffix(".xsl").exists()
else basename.parent.joinpath(".default.xsl")
)
_run_process(target_file, processor, source_file, basename, lang)
def _process_stylesheet(
def _process_set(
source_dir: Path,
languages: list[str],
target: Path,
processor: Path,
files: set[Path],
) -> None:
basename = get_basepath(processor)
destination_base = target.joinpath(basename.relative_to(source_dir))
for lang in languages:
target_file = destination_base.with_suffix(
f".{lang}{processor.with_suffix('').suffix}",
# generate the transform func
# doing it here should mean that we use only one per thread,
# and also process each one only once
# Max memory and performance efficacy
parser = etree.XMLParser(remove_blank_text=True, remove_comments=True)
xslt_tree = etree.parse(processor.resolve(), parser)
transform = etree.XSLT(xslt_tree)
for basepath, lang in product(files, languages):
source_file = Path(str(basepath) + f".{lang}.xhtml")
target_suffix = (
".html" if (len(processor.suffixes) == 1) else processor.suffixes[0]
)
source_file = basename.with_suffix(f".{lang}.xhtml")
_run_process(target_file, processor, source_file, basename, lang)
target_file = target.joinpath(
source_file.relative_to(source_dir),
).with_suffix(target_suffix)
# if the target file does not exist, we make it
if not target_file.exists() or any(
# If any source file is newer than the file to be generated
# we recreate the generated file
# if the source file does not exist, ignore it.
(
(file.exists() and file.stat().st_mtime > target_file.stat().st_mtime)
for file in [
(
source_file
if source_file.exists()
else Path(str(basepath) + ".en.xhtml")
),
processor,
(
source_file.parent.joinpath("." + basepath.name).with_suffix(
".xmllist",
)
),
Path(f"global/data/texts/.texts.{lang}.xml"),
Path(f"global/data/topbanner/.topbanner.{lang}.xml"),
Path("global/data/texts/texts.en.xml"),
]
),
):
logger.debug("Building %s", target_file)
result = process_file(source_file, transform)
target_file.parent.mkdir(parents=True, exist_ok=True)
result.write_output(target_file)
def process_files(
@@ -96,29 +78,54 @@ def process_files(
Build .html, .rss and .ics files from .xhtml sources
"""
# TODO for performance it would be better to iterate by processor xls,
# and parse it only once and pass the xsl object to called function.
logger.info("Processing xhtml files")
logger.info("Processing xhtml, rss, ics files")
# generate a set of unique processing xsls
xsl_files = {
processor.resolve().relative_to(source_dir.parent.resolve())
for processor in source_dir.glob("**/*.xsl")
}
process_files_dict = {}
for processor in xsl_files:
process_files_dict[processor] = set()
# This gathers all the simple xhtml files for generating xhtml output
for file in source_dir.glob("**/*.*.xhtml"):
# Processors with a file ending for the output encoded in the name, eg
# events.rss.xsl
type_specific_processors = set(
file.parent.glob(f"{get_basepath(file).name}.*.xsl")
)
# The form that a named processor for a file would be
# Essentially, if there is no output name, we default to html
# <basename>.html.xsl should be equal in effect to <basename>.xsl
xhtml_named_processor = (
file.with_suffix("")
.with_suffix(".xsl")
.resolve()
.relative_to(source_dir.parent.resolve())
)
# if that does not exist, default to
xhtml_default_processor = (
file.parent.joinpath(".default.xsl")
.resolve()
.relative_to(source_dir.parent.resolve())
)
xhtml_processor = (
xhtml_named_processor
if xhtml_named_processor.exists()
else xhtml_default_processor
)
# And add the files/processors we should be using for every file
for processor in type_specific_processors | {xhtml_processor}:
# If we are using a type specific processor, there will be more suffixes
file_to_add = processor if len(processor.suffixes) > 1 else file
process_files_dict[processor].add(get_basepath(file_to_add))
pool.starmap(
_process_dir,
_process_set,
(
(source_dir, languages, target, directory)
for directory in {path.parent for path in source_dir.glob("**/*.*.xhtml")}
),
)
logger.info("Processing rss files")
pool.starmap(
_process_stylesheet,
(
(source_dir, languages, target, processor)
for processor in source_dir.glob("**/*.rss.xsl")
),
)
logger.info("Processing ics files")
pool.starmap(
_process_stylesheet,
(
(source_dir, languages, target, processor)
for processor in source_dir.glob("**/*.ics.xsl")
(source_dir, languages, target, processor, files)
for processor, files in process_files_dict.items()
),
)

View File

@@ -7,7 +7,7 @@ from lxml import etree
@pytest.fixture
def sample_xsl(tmp_path: Path) -> Path:
def sample_xsl_transformer(tmp_path: Path) -> etree.XSLT:
"""Minimal XSLT that just copies the input through."""
xsl_path = tmp_path / "sample.xsl"
xsl_path.write_text(
@@ -26,7 +26,9 @@ def sample_xsl(tmp_path: Path) -> Path:
""",
).strip(),
)
return xsl_path
parser = etree.XMLParser(remove_blank_text=True, remove_comments=True)
xslt_tree = etree.parse(xsl_path.resolve(), parser)
return etree.XSLT(xslt_tree)
@pytest.mark.parametrize(
@@ -44,7 +46,7 @@ def sample_xsl(tmp_path: Path) -> Path:
)
def process_file_link_rewrites_test(
tmp_path: Path,
sample_xsl: Path,
sample_xsl_transformer: etree.XSLT,
lang: str,
link_in: str,
link_out: str,
@@ -61,8 +63,9 @@ def process_file_link_rewrites_test(
""",
).strip(),
)
assert isinstance(sample_xsl_transformer, etree.XSLT)
result_doc = process_file(xml_path, sample_xsl)
result_doc = process_file(xml_path, sample_xsl_transformer)
assert isinstance(result_doc, etree._XSLTResultTree)
# We get a list, but as we have only one link in the above sample
# we only need to care about the first one