fsfe-website/build/lib/process_file.py

# SPDX-FileCopyrightText: Free Software Foundation Europe e.V. <https://fsfe.org>
#
# SPDX-License-Identifier: GPL-3.0-or-later

import logging
import re
from datetime import datetime
from pathlib import Path

import lxml.etree as etree

from build.lib.misc import get_basename, get_version, lang_from_filename

logger = logging.getLogger(__name__)


def _include_xml(file: Path) -> str:
    """
    include second level elements of a given XML file
    this emulates the behaviour of the original
    build script which wasn't able to load top
    level elements from any file
    """
    work_str = ""
    if file.exists():
        tree = etree.parse(file)
        root = tree.getroot()
        # Remove <version> because the filename attribute would otherwise be added
        # to this element instead of the actual content element.
        for elem in root.xpath("version"):
            root.remove(elem)
        # Iterate over all elements in root node, add a filename attribute
        # and then append the string to work_str
        for elem in root.xpath("*"):
            elem.set("filename", get_basename(file))
            work_str += etree.tostring(elem, encoding="utf-8").decode("utf-8")
    return work_str
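
# Illustration (hypothetical file): for news/example.en.xml whose root holds a
# <version>2</version> element and one <news> child, the <version> element is
# dropped and the returned string is roughly '<news filename="example">...</news>',
# the filename value coming from get_basename().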


def _get_attributes(file: Path) -> str:
    """
    get attributes of top level element in a given
    XHTML file
    """
    work_str = ""
    tree = etree.parse(file)
    root = tree.getroot()
    attributes = root.attrib
    for attrib in attributes:
        work_str += f'{attrib}="{attributes[attrib]}"\n'
    return work_str
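
# Illustration (hypothetical page): for a source file whose root element is
# <html newsdate="2025-01-01" type="newsletter">, this yields the lines
# newsdate="2025-01-01" and type="newsletter", ready to be spliced into the
# <document> tag of the build stream below.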


def _list_langs(file: Path) -> str:
    """
    list all languages a file exists in by globbing up
    the shortname (i.e. file path with file ending omitted)
    output is readily formatted for inclusion
    in xml stream
    """
    return "\n".join(
        list(
            map(
                lambda path: (
                    f'<tr id="{lang_from_filename(path)}">'
                    + (
                        Path(f"global/languages/{lang_from_filename(path)}")
                        .read_text()
                        .strip()
                        if Path(f"global/languages/{lang_from_filename(path)}").exists()
                        else lang_from_filename(path)
                    )
                    + "</tr>"
                ),
                file.parent.glob(f"{get_basename(file)}.??{file.suffix}"),
            )
        )
    )
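
# Illustration: for about/about.en.xhtml this globs about/about.??.xhtml and,
# assuming a German translation exists and global/languages/de contains
# "Deutsch", emits rows such as <tr id="de">Deutsch</tr> and
# <tr id="en">English</tr> for the <trlist> element built below.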


def _auto_sources(action_file: Path, lang: str) -> str:
    """
    import elements from source files, add file name
    attribute to first element included from each file
    """
    work_str = ""
    list_file = action_file.with_stem(
        f".{action_file.with_suffix('').stem}"
    ).with_suffix(".xmllist")
    if list_file.exists():
        with list_file.open("r") as file:
            for path in map(lambda line: Path(line.strip()), file):
                path_xml = (
                    path.with_suffix(f".{lang}.xml")
                    if path.with_suffix(f".{lang}.xml").exists()
                    else path.with_suffix(".en.xml")
                )
                work_str += _include_xml(path_xml)
    return work_str
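
# Illustration (hypothetical paths): for action_file activities/ftf/ftf.de.xhtml
# the list file is activities/ftf/.ftf.xmllist; every path listed in it is
# included as <path>.de.xml when that translation exists, falling back to
# <path>.en.xml otherwise.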


def _build_xmlstream(infile: Path):
    """
    assemble the xml stream for feeding into xsltproc
    the expected shortname and language flag indicate
    a single xhtml page to be built
    """
    # TODO
    # Ideally this would use lxml to construct an object instead of string templating.
    # Should be a little faster, and also guarantees that it's valid xml
    logger.debug(f"infile: {infile}")
    shortname = infile.with_suffix("")
    lang = lang_from_filename(infile)
    glob = infile.parent.joinpath(f"{get_basename(infile)}.??{infile.suffix}")
    logger.debug(f"formed glob: {glob}")
    lang_lst = list(
        infile.parent.glob(f"{get_basename(infile)}.??{infile.suffix}"),
    )
    logger.debug(f"file lang list: {lang_lst}")
    original_lang = (
        "en"
        if infile.with_suffix("").with_suffix(f".en{infile.suffix}").exists()
        else sorted(
            infile.parent.glob(f"{get_basename(infile)}.??{infile.suffix}"),
            key=get_version,
            reverse=True,
        )[0]
        .with_suffix("")
        .suffix.removeprefix(".")
    )
    topbanner_xml = Path(f"global/data/topbanner/.topbanner.{lang}.xml")
    texts_xml = Path(f"global/data/texts/.texts.{lang}.xml")
    date = str(datetime.now().date())
    # time = str(datetime.now().time())
    action_lang = ""
    translation_state = ""
    if infile.exists():
        action_lang = lang
        original_version = get_version(
            shortname.with_suffix(f".{original_lang}{infile.suffix}")
        )
        lang_version = get_version(shortname.with_suffix(f".{lang}{infile.suffix}"))
        translation_state = (
            "up-to-date"
            if (original_version <= lang_version)
            else (
                "very-outdated"
                if (original_version - 3 >= lang_version)
                else "outdated"
            )
        )
    else:
        action_lang = original_lang
        translation_state = "untranslated"
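    # Version-gap thresholds illustrated: with the original at version 5, a
    # translation at version 5 or higher is "up-to-date", at 3 or 4 it is
    # "outdated", and at 2 or lower (3 or more behind) it is "very-outdated".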
    action_file = shortname.with_suffix(f".{action_lang}{infile.suffix}")
    logger.debug(f"action_file: {action_file}")
    result_str = f"""
    <buildinfo
        date="{date}"
        original="{original_lang}"
        filename="/{str(shortname.with_suffix("")).removeprefix("/")}"
        fileurl="/{shortname.relative_to(shortname.parts[0]).with_suffix("")}"
        dirname="/{shortname.parent}/"
        language="{lang}"
        translation_state="{translation_state}"
    >
        <trlist>
            {_list_langs(infile)}
        </trlist>
        <topbanner>
            {_include_xml(topbanner_xml)}
        </topbanner>
        <textsetbackup>
            {_include_xml(Path("global/data/texts/texts.en.xml"))}
        </textsetbackup>
        <textset>
            {_include_xml(texts_xml)}
        </textset>
        <document
            language="{action_lang}"
            {_get_attributes(action_file)}
        >
            <set>
                {_auto_sources(action_file, lang)}
            </set>
            {_include_xml(action_file)}
        </document>
    </buildinfo>
    """
    return result_str
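
# Illustrative call (hypothetical page path): _build_xmlstream(Path("about/about.en.xhtml"))
# returns the <buildinfo> envelope above with language="en"; process_file() below
# feeds that string straight into the XSLT processor.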


def process_file(infile: Path, processor: Path) -> str:
    """
    Process a given file using the correct xsl sheet
    """
    logger.debug(f"Processing {infile}")
    lang = lang_from_filename(infile)
    xmlstream = _build_xmlstream(infile)
    xslt_tree = etree.parse(processor.resolve())
    transform = etree.XSLT(xslt_tree)
    result = str(transform(etree.XML(xmlstream)))
    # And now a bunch of regexes to fix some links.
    # xx is the language code in all comments
    # TODO
    # Probably a faster way to do this
    # Maybe iterating through all a tags with lxml?
    # Once buildxmlstream generates an xml object that should be faster.
    # Remove https://fsfe.org (or https://test.fsfe.org) from the start of all
    # links
    result = re.sub(
        r"""href\s*=\s*("|')(https?://(?:test\.)?fsfe\.org)([^>]*?)\1""",
        r"""href=\1\3\1""",
        result,
        flags=re.MULTILINE | re.IGNORECASE,
    )
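    # e.g. href="https://fsfe.org/about/" becomes href="/about/", so the
    # language-suffix rewrites below treat it like any other local link.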
    # Change links from /foo/bar.html into /foo/bar.xx.html
    # Change links from foo/bar.html into foo/bar.xx.html
    # Same for .rss and .ics links
    result = re.sub(
        r"""href\s*=\s*("|')(/?([^:>]+/)?[^:/.]+\.)(html|rss|ics)(#[^>]*)?\1""",
        rf"""href=\1\2{lang}.\4\5\1""",
        result,
        flags=re.MULTILINE | re.IGNORECASE,
    )
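    # e.g. with lang "de": href="/freesoftware/freesoftware.html" becomes
    # href="/freesoftware/freesoftware.de.html"; a #fragment, if present, is kept.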
    # Change links from /foo/bar/ into /foo/bar/index.xx.html
    # Change links from foo/bar/ into foo/bar/index.xx.html
    result = re.sub(
        r"""href\s*=\s*("|')(/?[^:>]+/)\1""",
        rf"""href=\1\2index.{lang}.html\1""",
        result,
        flags=re.MULTILINE | re.IGNORECASE,
    )
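    # e.g. with lang "de": href="/about/" becomes href="/about/index.de.html".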
    return result
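

# Minimal manual smoke test, not part of the build pipeline: run as
# `python -m build.lib.process_file <page.xx.xhtml> <stylesheet.xsl>` from the
# repository root. Both arguments are illustrative; any page/XSL pair found in
# the source tree should work.
if __name__ == "__main__":
    import sys

    logging.basicConfig(level=logging.DEBUG)
    if len(sys.argv) != 3:
        sys.exit("usage: python -m build.lib.process_file <page.xx.xhtml> <stylesheet.xsl>")
    print(process_file(Path(sys.argv[1]), Path(sys.argv[2])))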