refactor: transform processor xsls only once (#5397)
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
# About ## Summary Before we iterated over xhtml files by directory, and rss/ics ones by processor (xsl stylesheet). This worked, but meant we had to pass the processor filepath to the processing functions, which meant we were reprocessing the xsl several times, probably hundred for some xsl's, like the default one for a site. We now instead iterate by processor, pre parsing it before passing it to the processing code. This gets us a significant speed increase (~30%), and makes the code a little cleaner. # Benchmarks ## Before ``` Benchmark 1: uv run build --full Time (mean ± σ): 269.798 s ± 12.501 s [User: 1566.724 s, System: 75.804 s] Range (min … max): 262.183 s … 284.226 s 3 runs ``` ## After ``` Benchmark 1: uv run build --full Time (mean ± σ): 206.330 s ± 5.088 s [User: 498.556 s, System: 47.233 s] Range (min … max): 202.760 s … 212.155 s 3 runs ``` Co-authored-by: Darragh Elliott <me@delliott.net> Co-authored-by: tobiasd <tobiasd@fsfe.org> Reviewed-on: #5397 Co-authored-by: delliott <delliott@fsfe.org> Co-committed-by: delliott <delliott@fsfe.org>
This commit is contained in:
@@ -185,7 +185,7 @@ def _build_xmlstream(infile: Path, parser: etree.XMLParser) -> etree.Element:
|
||||
return page
|
||||
|
||||
|
||||
def process_file(infile: Path, processor: Path) -> etree._XSLTResultTree:
|
||||
def process_file(infile: Path, transform: etree.XSLT) -> etree._XSLTResultTree:
|
||||
"""
|
||||
Process a given file using the correct xsl sheet
|
||||
"""
|
||||
@@ -193,8 +193,6 @@ def process_file(infile: Path, processor: Path) -> etree._XSLTResultTree:
|
||||
lang = lang_from_filename(infile)
|
||||
parser = etree.XMLParser(remove_blank_text=True, remove_comments=True)
|
||||
xmlstream = _build_xmlstream(infile, parser)
|
||||
xslt_tree = etree.parse(processor.resolve(), parser)
|
||||
transform = etree.XSLT(xslt_tree)
|
||||
result = transform(xmlstream)
|
||||
# And now a bunch of regexes to fix some links.
|
||||
# xx is the language code in all comments
|
||||
|
||||
@@ -4,86 +4,68 @@
|
||||
|
||||
import logging
|
||||
import multiprocessing.pool
|
||||
from itertools import product
|
||||
from pathlib import Path
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from fsfe_website_build.lib.misc import get_basepath
|
||||
from fsfe_website_build.lib.process_file import process_file
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _run_process(
|
||||
target_file: Path,
|
||||
processor: Path,
|
||||
source_file: Path,
|
||||
basename: Path,
|
||||
lang: str,
|
||||
) -> None:
|
||||
# if the target file does not exist, we make it
|
||||
if not target_file.exists() or any(
|
||||
# If any source file is newer than the file to be generated
|
||||
# we recreate the generated file
|
||||
# if the source file does not exist, ignore it.
|
||||
(
|
||||
(file.exists() and file.stat().st_mtime > target_file.stat().st_mtime)
|
||||
for file in [
|
||||
(
|
||||
source_file
|
||||
if source_file.exists()
|
||||
else basename.with_suffix(".en.xhtml")
|
||||
),
|
||||
processor,
|
||||
(
|
||||
source_file.parent.joinpath("." + basename.name).with_suffix(
|
||||
".xmllist",
|
||||
)
|
||||
),
|
||||
Path(f"global/data/texts/.texts.{lang}.xml"),
|
||||
Path(f"global/data/topbanner/.topbanner.{lang}.xml"),
|
||||
Path("global/data/texts/texts.en.xml"),
|
||||
]
|
||||
),
|
||||
):
|
||||
logger.debug("Building %s", target_file)
|
||||
result = process_file(source_file, processor)
|
||||
target_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
result.write_output(target_file)
|
||||
|
||||
|
||||
def _process_dir(
|
||||
source_dir: Path,
|
||||
languages: list[str],
|
||||
target: Path,
|
||||
directory: Path,
|
||||
) -> None:
|
||||
for basename in {path.with_suffix("") for path in directory.glob("*.??.xhtml")}:
|
||||
for lang in languages:
|
||||
source_file = basename.with_suffix(f".{lang}.xhtml")
|
||||
target_file = target.joinpath(
|
||||
source_file.relative_to(source_dir),
|
||||
).with_suffix(".html")
|
||||
processor = (
|
||||
basename.with_suffix(".xsl")
|
||||
if basename.with_suffix(".xsl").exists()
|
||||
else basename.parent.joinpath(".default.xsl")
|
||||
)
|
||||
_run_process(target_file, processor, source_file, basename, lang)
|
||||
|
||||
|
||||
def _process_stylesheet(
|
||||
def _process_set(
|
||||
source_dir: Path,
|
||||
languages: list[str],
|
||||
target: Path,
|
||||
processor: Path,
|
||||
files: set[Path],
|
||||
) -> None:
|
||||
basename = get_basepath(processor)
|
||||
destination_base = target.joinpath(basename.relative_to(source_dir))
|
||||
for lang in languages:
|
||||
target_file = destination_base.with_suffix(
|
||||
f".{lang}{processor.with_suffix('').suffix}",
|
||||
# generate the transform func
|
||||
# doing it here should mean that we use only one per thread,
|
||||
# and also process each one only once
|
||||
# Max memory and performance efficacy
|
||||
parser = etree.XMLParser(remove_blank_text=True, remove_comments=True)
|
||||
xslt_tree = etree.parse(processor.resolve(), parser)
|
||||
transform = etree.XSLT(xslt_tree)
|
||||
for basepath, lang in product(files, languages):
|
||||
source_file = Path(str(basepath) + f".{lang}.xhtml")
|
||||
target_suffix = (
|
||||
".html" if (len(processor.suffixes) == 1) else processor.suffixes[0]
|
||||
)
|
||||
source_file = basename.with_suffix(f".{lang}.xhtml")
|
||||
_run_process(target_file, processor, source_file, basename, lang)
|
||||
target_file = target.joinpath(
|
||||
source_file.relative_to(source_dir),
|
||||
).with_suffix(target_suffix)
|
||||
# if the target file does not exist, we make it
|
||||
if not target_file.exists() or any(
|
||||
# If any source file is newer than the file to be generated
|
||||
# we recreate the generated file
|
||||
# if the source file does not exist, ignore it.
|
||||
(
|
||||
(file.exists() and file.stat().st_mtime > target_file.stat().st_mtime)
|
||||
for file in [
|
||||
(
|
||||
source_file
|
||||
if source_file.exists()
|
||||
else Path(str(basepath) + ".en.xhtml")
|
||||
),
|
||||
processor,
|
||||
(
|
||||
source_file.parent.joinpath("." + basepath.name).with_suffix(
|
||||
".xmllist",
|
||||
)
|
||||
),
|
||||
Path(f"global/data/texts/.texts.{lang}.xml"),
|
||||
Path(f"global/data/topbanner/.topbanner.{lang}.xml"),
|
||||
Path("global/data/texts/texts.en.xml"),
|
||||
]
|
||||
),
|
||||
):
|
||||
logger.debug("Building %s", target_file)
|
||||
result = process_file(source_file, transform)
|
||||
target_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
result.write_output(target_file)
|
||||
|
||||
|
||||
def process_files(
|
||||
@@ -96,29 +78,54 @@ def process_files(
|
||||
Build .html, .rss and .ics files from .xhtml sources
|
||||
|
||||
"""
|
||||
# TODO for performance it would be better to iterate by processor xls,
|
||||
# and parse it only once and pass the xsl object to called function.
|
||||
logger.info("Processing xhtml files")
|
||||
logger.info("Processing xhtml, rss, ics files")
|
||||
# generate a set of unique processing xsls
|
||||
xsl_files = {
|
||||
processor.resolve().relative_to(source_dir.parent.resolve())
|
||||
for processor in source_dir.glob("**/*.xsl")
|
||||
}
|
||||
|
||||
process_files_dict = {}
|
||||
for processor in xsl_files:
|
||||
process_files_dict[processor] = set()
|
||||
|
||||
# This gathers all the simple xhtml files for generating xhtml output
|
||||
for file in source_dir.glob("**/*.*.xhtml"):
|
||||
# Processors with a file ending for the output encoded in the name, eg
|
||||
# events.rss.xsl
|
||||
type_specific_processors = set(
|
||||
file.parent.glob(f"{get_basepath(file).name}.*.xsl")
|
||||
)
|
||||
# The form that a named processor for a file would be
|
||||
# Essentially, if there is no output name, we default to html
|
||||
# <basename>.html.xsl should be equal in effect to <basename>.xsl
|
||||
xhtml_named_processor = (
|
||||
file.with_suffix("")
|
||||
.with_suffix(".xsl")
|
||||
.resolve()
|
||||
.relative_to(source_dir.parent.resolve())
|
||||
)
|
||||
# if that does not exist, default to
|
||||
xhtml_default_processor = (
|
||||
file.parent.joinpath(".default.xsl")
|
||||
.resolve()
|
||||
.relative_to(source_dir.parent.resolve())
|
||||
)
|
||||
xhtml_processor = (
|
||||
xhtml_named_processor
|
||||
if xhtml_named_processor.exists()
|
||||
else xhtml_default_processor
|
||||
)
|
||||
# And add the files/processors we should be using for every file
|
||||
for processor in type_specific_processors | {xhtml_processor}:
|
||||
# If we are using a type specific processor, there will be more suffixes
|
||||
file_to_add = processor if len(processor.suffixes) > 1 else file
|
||||
process_files_dict[processor].add(get_basepath(file_to_add))
|
||||
|
||||
pool.starmap(
|
||||
_process_dir,
|
||||
_process_set,
|
||||
(
|
||||
(source_dir, languages, target, directory)
|
||||
for directory in {path.parent for path in source_dir.glob("**/*.*.xhtml")}
|
||||
),
|
||||
)
|
||||
logger.info("Processing rss files")
|
||||
pool.starmap(
|
||||
_process_stylesheet,
|
||||
(
|
||||
(source_dir, languages, target, processor)
|
||||
for processor in source_dir.glob("**/*.rss.xsl")
|
||||
),
|
||||
)
|
||||
logger.info("Processing ics files")
|
||||
pool.starmap(
|
||||
_process_stylesheet,
|
||||
(
|
||||
(source_dir, languages, target, processor)
|
||||
for processor in source_dir.glob("**/*.ics.xsl")
|
||||
(source_dir, languages, target, processor, files)
|
||||
for processor, files in process_files_dict.items()
|
||||
),
|
||||
)
|
||||
|
||||
@@ -7,7 +7,7 @@ from lxml import etree
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_xsl(tmp_path: Path) -> Path:
|
||||
def sample_xsl_transformer(tmp_path: Path) -> etree.XSLT:
|
||||
"""Minimal XSLT that just copies the input through."""
|
||||
xsl_path = tmp_path / "sample.xsl"
|
||||
xsl_path.write_text(
|
||||
@@ -26,7 +26,9 @@ def sample_xsl(tmp_path: Path) -> Path:
|
||||
""",
|
||||
).strip(),
|
||||
)
|
||||
return xsl_path
|
||||
parser = etree.XMLParser(remove_blank_text=True, remove_comments=True)
|
||||
xslt_tree = etree.parse(xsl_path.resolve(), parser)
|
||||
return etree.XSLT(xslt_tree)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -44,7 +46,7 @@ def sample_xsl(tmp_path: Path) -> Path:
|
||||
)
|
||||
def process_file_link_rewrites_test(
|
||||
tmp_path: Path,
|
||||
sample_xsl: Path,
|
||||
sample_xsl_transformer: etree.XSLT,
|
||||
lang: str,
|
||||
link_in: str,
|
||||
link_out: str,
|
||||
@@ -61,8 +63,9 @@ def process_file_link_rewrites_test(
|
||||
""",
|
||||
).strip(),
|
||||
)
|
||||
assert isinstance(sample_xsl_transformer, etree.XSLT)
|
||||
|
||||
result_doc = process_file(xml_path, sample_xsl)
|
||||
result_doc = process_file(xml_path, sample_xsl_transformer)
|
||||
assert isinstance(result_doc, etree._XSLTResultTree)
|
||||
# We get a list, but as we have only one link in the above sample
|
||||
# we only need to care about the first one
|
||||
|
||||
Reference in New Issue
Block a user