feat/lxml (#5181)
All checks were successful
continuous-integration/drone/push Build is passing

#5146 attempt 2, texts and topbanner issue resolved.

After the issues I have deployed this to https://test.fsfe.org, so we can see if there are further issues there.

It seems not though.

Co-authored-by: Darragh Elliott <me@delliott.xyz>
Reviewed-on: #5181
Co-authored-by: delliott <delliott@fsfe.org>
Co-committed-by: delliott <delliott@fsfe.org>
This commit was merged in pull request #5181.
This commit is contained in:
2025-07-29 08:17:53 +00:00
committed by tobiasd
parent 4ce0777468
commit f08d5ef925
14 changed files with 130 additions and 123 deletions

View File

@@ -14,76 +14,61 @@ from build.lib.misc import get_basename, get_version, lang_from_filename
logger = logging.getLogger(__name__)
def _include_xml(file: Path) -> str:
def _get_xmls(file: Path, parser: etree.XMLParser) -> etree.Element:
"""
include second level elements of a given XML file
this emulates the behaviour of the original
build script which wasn't able to load top
level elements from any file
"""
work_str = ""
elements = []
if file.exists():
tree = etree.parse(file)
tree = etree.parse(file, parser)
root = tree.getroot()
# Remove <version> because the filename attribute would otherwise be added
# to this element instead of the actual content element.
for elem in root.xpath("version"):
root.remove(elem)
# Iterate over all elements in root node, add a filename attribute and
# then append the string to work_str
for elem in root.xpath("*"):
elem.set("filename", get_basename(file))
work_str += etree.tostring(elem, encoding="utf-8").decode("utf-8")
return work_str
elements.append(elem)
# and then we return the element
return elements
def _get_attributes(file: Path) -> str:
def _get_attributes(file: Path) -> dict:
"""
get attributes of top level element in a given
XHTML file
"""
work_str = ""
tree = etree.parse(file)
root = tree.getroot()
attributes = root.attrib
for attrib in attributes:
work_str += f'{attrib}="{attributes[attrib]}"\n'
return work_str
attributes = root.items()
return dict(attributes)
def _list_langs(file: Path) -> str:
def _get_trlist(file: Path) -> etree.Element:
"""
list all languages a file exists in by globbing up
the shortname (i.e. file path with file ending omitted)
output is readily formatted for inclusion
in xml stream
"""
return "\n".join(
list(
map(
lambda path: (
f'<tr id="{lang_from_filename(path)}">'
+ (
Path(f"global/languages/{lang_from_filename(path)}")
.read_text()
.strip()
)
+ "</tr>"
),
file.parent.glob(f"{get_basename(file)}.??{file.suffix}"),
)
trlist = etree.Element("trlist")
for path in file.parent.glob(f"{get_basename(file)}.??{file.suffix}"):
tr = etree.SubElement(trlist, "tr", id=lang_from_filename(path))
tr.text = (
Path(f"global/languages/{lang_from_filename(path)}").read_text().strip()
)
)
return trlist
def _auto_sources(action_file: Path, lang: str) -> str:
def _get_set(action_file: Path, lang: str, parser: etree.XMLParser) -> etree.Element:
"""
import elements from source files, add file name
attribute to first element included from each file
"""
work_str = ""
doc_set = etree.Element("set")
list_file = action_file.with_stem(
f".{action_file.with_suffix('').stem}"
).with_suffix(".xmllist")
@@ -96,21 +81,28 @@ def _auto_sources(action_file: Path, lang: str) -> str:
if path.with_suffix(f".{lang}.xml").exists()
else path.with_suffix(".en.xml")
)
work_str += _include_xml(path_xml)
doc_set.extend(_get_xmls(path_xml, parser))
return work_str
return doc_set
def _build_xmlstream(infile: Path):
def _get_document(
action_lang: str, action_file: Path, lang: str, parser: etree.XMLParser
) -> etree.Element:
document = etree.Element(
"document", language=action_lang, **_get_attributes(action_file)
)
document.append(_get_set(action_file, lang, parser))
document.extend(_get_xmls(action_file, parser))
return document
def _build_xmlstream(infile: Path, parser: etree.XMLParser) -> etree.Element:
"""
assemble the xml stream for feeding into xsltproc
the expected shortname and language flag indicate
a single xhtml page to be built
"""
# TODO
# Ideally this would use lxml to construct an object instead of string templating.
# Should be a little faster, and also guarantees that its valid xml
logger.debug(f"infile: {infile}")
shortname = infile.with_suffix("")
lang = lang_from_filename(infile)
@@ -134,7 +126,6 @@ def _build_xmlstream(infile: Path):
topbanner_xml = Path(f"global/data/topbanner/.topbanner.{lang}.xml")
texts_xml = Path(f"global/data/texts/.texts.{lang}.xml")
date = str(datetime.now().date())
# time = str(datetime.now().time())
action_lang = ""
translation_state = ""
@@ -159,41 +150,34 @@ def _build_xmlstream(infile: Path):
action_file = shortname.with_suffix(f".{action_lang}{infile.suffix}")
logger.debug(f"action_file: {action_file}")
# Create the root element
page = etree.Element(
"buildinfo",
date=date,
original=original_lang,
filename=f"/{str(shortname.with_suffix('')).removeprefix('/')}",
fileurl=f"/{shortname.relative_to(shortname.parts[0]).with_suffix('')}",
dirname=f"/{shortname.parent}/",
language=lang,
translation_state=translation_state,
)
result_str = f"""
<buildinfo
date="{date}"
original="{original_lang}"
filename="/{str(shortname.with_suffix("")).removeprefix("/")}"
fileurl="/{shortname.relative_to(shortname.parts[0]).with_suffix("")}"
dirname="/{shortname.parent}/"
language="{lang}"
translation_state="{translation_state}"
>
<trlist>
{_list_langs(infile)}
</trlist>
<topbanner>
{_include_xml(topbanner_xml)}
</topbanner>
<textsetbackup>
{_include_xml(Path("global/data/texts/texts.en.xml"))}
</textsetbackup>
<textset>
{_include_xml(texts_xml)}
</textset>
<document
language="{action_lang}"
{_get_attributes(action_file)}
>
<set>
{_auto_sources(action_file, lang)}
</set>
{_include_xml(action_file)}
</document>
</buildinfo>
"""
return result_str
# Add the subelements
page.append(_get_trlist(infile))
# Make the relevant subelmenets
# and then add the relevant xmls to it
topbanner = etree.SubElement(page, "topbanner")
topbanner.extend(_get_xmls(topbanner_xml, parser))
textsetbackup = etree.SubElement(page, "textsetbackup")
textsetbackup.extend(_get_xmls(Path("global/data/texts/texts.en.xml"), parser))
textset = etree.SubElement(page, "textset")
textset.extend(_get_xmls(texts_xml, parser))
page.append(_get_document(action_lang, action_file, lang, parser))
return page
def process_file(infile: Path, processor: Path) -> str:
@@ -202,41 +186,54 @@ def process_file(infile: Path, processor: Path) -> str:
"""
logger.debug(f"Processing {infile}")
lang = lang_from_filename(infile)
xmlstream = _build_xmlstream(infile)
xslt_tree = etree.parse(processor.resolve())
parser = etree.XMLParser(remove_blank_text=True, remove_comments=True)
xmlstream = _build_xmlstream(infile, parser)
xslt_tree = etree.parse(processor.resolve(), parser)
transform = etree.XSLT(xslt_tree)
result = str(transform(etree.XML(xmlstream)))
result = transform(xmlstream)
# And now a bunch of regexes to fix some links.
# xx is the language code in all comments
# TODO
# Probably a faster way to do this
# Maybe iterating though all a tags with lxml?
# Once buildxmlstream generates an xml object that should be faster.
# Remove https://fsfe.org (or https://test.fsfe.org) from the start of all
result = re.sub(
r"""href\s*=\s*("|')(https?://(test\.)?fsfe\.org)([^>])\1""",
r"""href=\1\3\1""",
result,
flags=re.MULTILINE | re.IGNORECASE,
)
# Change links from /foo/bar.html into /foo/bar.xx.html
# Change links from foo/bar.html into foo/bar.xx.html
# Same for .rss and .ics links
result = re.sub(
r"""href\s*=\s*("|')(/?([^:>]+/)?[^:/.]+\.)(html|rss|ics)(#[^>]*)?\1""",
rf"""href=\1\2{lang}.\4\5\1""",
result,
flags=re.MULTILINE | re.IGNORECASE,
)
# Change links from /foo/bar/ into /foo/bar/index.xx.html
# Change links from foo/bar/ into foo/bar/index.xx.html
result = re.sub(
r"""href\s*=\s*("|')(/?[^:>]+/)\1""",
rf"""href=\1\2index.{lang}.html\1""",
result,
flags=re.MULTILINE | re.IGNORECASE,
)
try:
for linkelem in result.xpath("//*[@href]"):
# remove any spurious whitespace
linkelem.set(
"href",
linkelem.get("href").strip(),
)
# Remove https://fsfe.org (or https://test.fsfe.org)
# from the start of all links
linkelem.set(
"href",
re.sub(
r"""^(https?://(test\.)?fsfe\.org)""",
"",
linkelem.get("href"),
flags=re.IGNORECASE,
),
)
# Change links from /foo/bar.html into /foo/bar.xx.html
# Change links from foo/bar.html into foo/bar.xx.html
# Same for .rss and .ics links
linkelem.set(
"href",
re.sub(
r"""^(/?([^:>]+/)?[^:/.]{3,}\.)(html|rss|ics)""",
rf"""\1{lang}.\3""",
linkelem.get("href"),
flags=re.IGNORECASE,
),
)
# Change links from /foo/bar/ into /foo/bar/index.xx.html
# Change links from foo/bar/ into foo/bar/index.xx.html
linkelem.set(
"href",
re.sub(
r"""^(/?[^:>]+/)$""",
rf"""\1index.{lang}.html""",
linkelem.get("href"),
flags=re.IGNORECASE,
),
)
except AssertionError:
logger.debug(f"Output generated for file {infile} is not valid xml")
return result

View File

@@ -16,5 +16,5 @@ def full() -> None:
"""
logger.info("Performing a full rebuild, git cleaning")
run_command(
["git", "clean", "-fdx", "--exclude", "/.venv"],
["git", "clean", "-fdx", "--exclude", "/.venv", "--exclude", "/.nltk_data"],
)

View File

@@ -45,7 +45,7 @@ def _run_process(
logger.debug(f"Building {target_file}")
result = process_file(source_file, processor)
target_file.parent.mkdir(parents=True, exist_ok=True)
target_file.write_text(result)
result.write_output(target_file)
def _process_dir(

View File

@@ -8,7 +8,7 @@
<!-- ====================================================================== -->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="text" encoding="UTF-8"/>
<xsl:output method="text" encoding="utf-8"/>
<xsl:template match="version">
<xsl:value-of select="."/>