feat/overview (#5435)
All checks were successful
continuous-integration/drone/push Build is passing

- fixes issues with xml structure pre commit hook
- Moves compare-files to lib
- adds an overview page on status.fsfe.org to see which files have different structures

Co-authored-by: Darragh Elliott <me@delliott.net>
Reviewed-on: #5435
Co-authored-by: delliott <delliott@fsfe.org>
Co-committed-by: delliott <delliott@fsfe.org>
This commit was merged in pull request #5435.
This commit is contained in:
2025-10-28 05:27:09 +00:00
committed by tobiasd
parent 3fb41f1ff1
commit 81d0b63a6e
8 changed files with 249 additions and 95 deletions

View File

@@ -0,0 +1,86 @@
# SPDX-FileCopyrightText: Free Software Foundation Europe e.V. <https://fsfe.org>
#
# SPDX-License-Identifier: GPL-3.0-or-later
import logging
import sys
from pathlib import Path
from lxml import etree
logger = logging.getLogger(__name__)
def compare_files(
    file1: Path,
    file2: Path,
    attr_whitelist: set[str] | None = None,
    _path: str = "",
) -> list[str]:
    """
    Parse two XML files and compare the structure of their root elements.

    Args:
        file1: First file to parse; its tags are used when building paths
            in the reported messages.
        file2: Second file to parse.
        attr_whitelist: Passed through to ``compare_elements``.
        _path: Path prefix passed through to ``compare_elements``;
            callers normally leave it at the default.

    Returns:
        The list of difference messages produced by ``compare_elements``
        (empty when no differences were found).

    Raises:
        SystemExit: With exit code 1 when either file is not well-formed
            XML. The parse error is logged at CRITICAL level first.
            NOTE(review): this terminates the calling process; confirm that
            is intended for every caller of this library function.
    """
    try:
        t1, t2 = etree.parse(file1), etree.parse(file2)
    except etree.XMLSyntaxError as e:
        logger.critical("XML parse error: %s", e)
        sys.exit(1)
    # Forward _path as well: previously it was accepted but silently dropped.
    return compare_elements(t1.getroot(), t2.getroot(), attr_whitelist, _path)
def compare_elements(
elem1: etree.Element,
elem2: etree.Element,
attr_whitelist: set[str] | None = None,
_path: str = "",
) -> list[str]:
"""
Recursively compare two XML elements.
Returns a list of short, informative error strings.
"""
if attr_whitelist is None:
attr_whitelist = set()
errors: list[str] = []
tag_path = f"{_path}/{elem1.tag}" if _path else elem1.tag
# tag mismatch
if elem1.tag != elem2.tag:
errors.append(f"Tag mismatch at {tag_path}: {elem1.tag}{elem2.tag}")
return errors # if tags differ, stop descending
# attribute deltas
attributes_of_elem1 = dict(elem1.attrib.items())
attributes_of_elem2 = dict(elem2.attrib.items())
only_in_elem1 = set(attributes_of_elem1) - set(attributes_of_elem2)
only_in_elem2 = set(attributes_of_elem2) - set(attributes_of_elem1)
common = set(attributes_of_elem1) & set(attributes_of_elem2)
if only_in_elem1 or only_in_elem2:
errors.append(
f"Attribute delta at <{elem1.tag}>"
f" only 1: {list(only_in_elem1)} only 2: {list(only_in_elem2)}"
)
for key in common:
if (
attributes_of_elem1[key] != attributes_of_elem2[key]
and key not in attr_whitelist
):
error_msg = (
f"Attribute value diff at <{elem1.tag} {key}>:"
f" {attributes_of_elem1[key]!r}{attributes_of_elem2[key]!r}"
)
errors.append(error_msg)
# child count
kids1 = list(elem1)
kids2 = list(elem2)
if len(kids1) != len(kids2):
errors.append(f"Child count at <{elem1.tag}>: {len(kids1)}{len(kids2)}")
return errors # if counts differ, stop descending
# and then recurse into children
for idx, (child1, child2) in enumerate(zip(kids1, kids2, strict=False), start=1):
errors.extend(
compare_elements(child1, child2, attr_whitelist, _path=f"{tag_path}[{idx}]")
)
return errors

View File

@@ -51,6 +51,12 @@
<a href="translations/">Translation status</a>
</li>
</ul>
<h2>Files with mismatched XML structure</h2>
<ul>
<li>
<a href="xml_structure/">XML structure status</a>
</li>
</ul>
</body>
</html>

View File

@@ -0,0 +1,6 @@
# SPDX-FileCopyrightText: Free Software Foundation Europe e.V. <https://fsfe.org>
#
# SPDX-License-Identifier: GPL-3.0-or-later
# __init__.py is a special Python file that allows a directory to become
# a Python package so it can be accessed using the 'import' statement.

View File

@@ -0,0 +1,32 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<xsl:import href="../../fsfe.org/fsfe.xsl"/>
<!-- Renders the <xml-structure-status/> placeholder: one collapsible
     <details> section per master file (sorted by name), each listing the
     files that differ from it together with the reported error text. -->
<xsl:template match="xml-structure-status">
  <div class="xml-structure-status">
    <h2>Status</h2>
    <xsl:for-each select="/buildinfo/document/set/master">
      <xsl:sort select="@name" order="ascending"/>
      <details>
        <summary><xsl:value-of select="@name"/></summary>
        <ul>
          <xsl:for-each select="detail">
            <li>
              <!-- Link to the file in the repository's web view -->
              <a href="https://git.fsfe.org/FSFE/fsfe-website/src/branch/master/{@name}">
                <xsl:value-of select="@name"/>
              </a>
              <xsl:text>: </xsl:text>
              <xsl:value-of select="@error"/>
            </li>
          </xsl:for-each>
        </ul>
      </details>
    </xsl:for-each>
  </div>
</xsl:template>
</xsl:stylesheet>

View File

@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<html external="true">
<version>1</version>
<head>
<title>FSFE XML Structure Status</title>
</head>
<body>
<h1>XML Structure Status</h1>
<p>This page shows the output of a simple script that logs which files have a mismatched XML structure.</p>
<p>The aim of this is to make it easier to have some idea of what files we should be working on.</p>
<xml-structure-status/>
</body>
</html>

View File

@@ -0,0 +1 @@
status.fsfe.org/xml_structure/data/*:[]

View File

@@ -0,0 +1,98 @@
# SPDX-FileCopyrightText: Free Software Foundation Europe e.V. <https://fsfe.org>
#
# SPDX-License-Identifier: GPL-3.0-or-later
import logging
import multiprocessing
from collections import defaultdict
from pathlib import Path
from fsfe_website_build.lib.checks import compare_files
from fsfe_website_build.lib.misc import (
get_basepath,
get_version,
lang_from_filename,
run_command,
update_if_changed,
)
from lxml import etree
logger = logging.getLogger(__name__)
def _job(
    master: Path, other: Path, whitelist: set[str]
) -> tuple[Path, Path, str] | None:
    """
    Compare one file against its master; intended as a ``starmap`` worker.

    Args:
        master: The reference file.
        other: The file compared against ``master``.
        whitelist: Attribute whitelist handed to ``compare_files``.

    Returns:
        ``(master, other, message)`` where ``message`` is all reported
        differences joined with ``"; "``, or ``None`` when the two files
        have different versions or no differences were reported.
    """
    # Files at different versions are not compared at all.
    if get_version(master) != get_version(other):
        return None
    errs = compare_files(master, other, whitelist)
    if not errs:
        return None
    return master, other, "; ".join(errs)
def run(
    source: Path,  # noqa: ARG001
    languages: list[str],
    processes: int,
    working_dir: Path,
) -> None:
    """
    Build xml-structure log for displaying on a status page

    Xmls are placed in target_dir, and only passed languages are processed.

    Args:
        source: Unused by this function.
        languages: Language codes to include; files whose language (as
            given by ``lang_from_filename``) is not listed are ignored.
        processes: Number of worker processes for the comparison pool.
        working_dir: Directory of the status page; the result is written to
            ``data/xml-structure-status.en.xml`` below it.
    """
    target_dir = working_dir.joinpath("data/")
    logger.debug("Building index of status of xml structure into dir %s", target_dir)

    all_git_tracked_files = run_command(
        ["git", "ls-files", "-z"],
    )
    all_files = {
        # Split on null bytes, strip and then parse into path
        path
        for path in (Path(line.strip()) for line in all_git_tracked_files.split("\x00"))
        if path.suffix in [".xhtml", ".xml"]
        and len(path.suffixes) >= 2  # noqa: PLR2004
        and lang_from_filename(path) in languages
    }
    whitelist = {"alt"}

    # Group the files by their base path. Iterating in sorted order (rather
    # than raw set order, which differs between interpreter runs) keeps the
    # task order, and therefore the generated XML, stable from run to run.
    groups: defaultdict[Path, list[Path]] = defaultdict(list)
    for path in sorted(all_files):
        groups[get_basepath(path)].append(path)

    # Every non-English file is compared against the English file of its group.
    tasks: list[tuple[Path, Path, set[str]]] = []
    for basepath, paths in groups.items():
        master = next(
            (path for path in paths if lang_from_filename(path) == "en"),
            None,
        )
        if master is None:
            logger.debug("No english translation of %s - skipped", basepath)
            continue
        tasks.extend((master, path, whitelist) for path in paths if path != master)

    with multiprocessing.Pool(processes) as pool:
        filtered_results = [
            result for result in pool.starmap(_job, tasks) if result is not None
        ]

    # Build dict: master: list of (other, message)
    tree: dict[Path, list[tuple[Path, str]]] = defaultdict(list)
    for master, other, message in filtered_results:
        tree[master].append((other, message))

    # Generate XML
    work_file = target_dir.joinpath("xml-structure-status.en.xml")
    target_dir.mkdir(parents=True, exist_ok=True)

    root = etree.Element("xml-structure-status")
    version_el = etree.SubElement(root, "version")
    version_el.text = "1"

    for master, details in tree.items():
        master_el = etree.SubElement(root, "master", name=str(master))
        for other, msg in details:
            etree.SubElement(
                master_el,
                "detail",
                name=str(other),
                error=msg,
            )

    xml_bytes = etree.tostring(root, xml_declaration=True, encoding="utf-8")
    update_if_changed(work_file, xml_bytes.decode("utf-8"))

View File

@@ -6,117 +6,29 @@ import sys
from collections import defaultdict
from pathlib import Path
from fsfe_website_build.lib.checks import (
compare_files,
)
from fsfe_website_build.lib.misc import (
get_basepath,
get_version,
lang_from_filename,
)
from lxml import etree
logger = logging.getLogger(__name__)
def compare_elements(
elem1: etree.Element,
elem2: etree.Element,
attr_whitelist: set[str] | None = None,
_path: str = "",
) -> list[str]:
"""
Recursively compare two XML elements.
Returns a list of short, informative error strings.
"""
if attr_whitelist is None:
attr_whitelist = set()
errors: list[str] = []
tag_path = f"{_path}/{elem1.tag}" if _path else elem1.tag
# tag mismatch
if elem1.tag != elem2.tag:
errors.append(f"Tag mismatch at {tag_path}: {elem1.tag}{elem2.tag}")
return errors # if tags differ, stop descending
# attribute deltas
attributes_of_elem1 = dict(elem1.attrib.items())
attributes_of_elem2 = dict(elem2.attrib.items())
only_in_elem1 = set(attributes_of_elem1) - set(attributes_of_elem2)
only_in_elem2 = set(attributes_of_elem2) - set(attributes_of_elem1)
common = set(attributes_of_elem1) & set(attributes_of_elem2)
if only_in_elem1 or only_in_elem2:
errors.append(
f"Attribute delta at <{elem1.tag}>"
f" only 1: {list(only_in_elem1)} only 2: {list(only_in_elem2)}"
)
for key in common:
if (
attributes_of_elem1[key] != attributes_of_elem2[key]
and key not in attr_whitelist
):
error_msg = (
f"Attribute value diff at <{elem1.tag} {key}>:"
f" {attributes_of_elem1[key]!r}{attributes_of_elem2[key]!r}"
)
errors.append(error_msg)
# child count
kids1 = list(elem1)
kids2 = list(elem2)
if len(kids1) != len(kids2):
errors.append(f"Child count at <{elem1.tag}>: {len(kids1)}{len(kids2)}")
return errors # if counts differ, stop descending
# and then recurse into children
for idx, (child1, child2) in enumerate(zip(kids1, kids2, strict=False), start=1):
errors.extend(
compare_elements(child1, child2, attr_whitelist, _path=f"{tag_path}[{idx}]")
)
return errors
def _job(master: Path, other: Path, whitelist: set[str]) -> str | None:
"""Return a one-line result string for starmap."""
try:
if get_version(master) != get_version(other):
return f"{other}: version differs → OK"
tree1, tree2 = etree.parse(master), etree.parse(other)
errs = compare_elements(tree1.getroot(), tree2.getroot(), whitelist)
return None
errs = compare_files(master, other, whitelist)
return f"{other}: {'; '.join(errs)}" if errs else None
except Exception as e:
return f"{other}: ERROR {e}"
def compare_two_files(file1: Path, file2: Path, whitelist: set[str]) -> None:
    """
    Check that two files share the same XML structure.

    Returns normally when the files have different versions (the comparison
    is then considered fine) or when no differences are found. Exits the
    process with status 2 if reading a version raises ``ValueError``, and
    with status 1 on an XML syntax error or when differences are found.
    """
    try:
        first_version = get_version(file1)
        second_version = get_version(file2)
    except ValueError as e:
        logger.critical("Version check failed: %s", e)
        sys.exit(2)

    # Differing versions: nothing to compare.
    if first_version != second_version:
        logger.info("Files are different versions, considering comparison okay")
        return

    try:
        first_tree = etree.parse(file1)
        second_tree = etree.parse(file2)
    except etree.XMLSyntaxError as e:
        logger.critical("XML parse error: %s", e)
        sys.exit(1)

    differences = compare_elements(
        first_tree.getroot(), second_tree.getroot(), whitelist
    )
    if not differences:
        logger.info("XML files match in structure and attributes.")
        return

    logger.warning("Differences found:\n%s", "\n".join(differences))
    sys.exit(1)
def main() -> None:
parser = argparse.ArgumentParser(
description="Compare XML structure and attributes. "
@@ -172,9 +84,9 @@ def main() -> None:
tasks.extend((master, path, args.whitelist) for path in paths if path != master)
with multiprocessing.Pool(processes=args.jobs) as pool:
filtered_results = (
filtered_results = [
result for result in pool.starmap(_job, tasks) if result is not None
)
]
if filtered_results:
for result in filtered_results:
logger.info(result)