Browse Source

bugfix: handling non-UTF-8 files in tagging

svn path=/trunk/; revision=33448
tags/stw2018
paul 3 years ago
parent
commit
354977821d
2 changed files with 13 additions and 10 deletions
  1. 4
    4
      build/sources.sh
  2. 9
    6
      build/xmlfiles.sh

+ 4
- 4
build/sources.sh View File

@@ -13,16 +13,16 @@ validate_tagmap(){
# map_tags FILE...
#
# For every XML file given as an argument, print the file name followed by a
# single space (no newline) and then hand the file content to a GNU sed
# program (-r -n) that works on the whole input at once:
#   * ':a;N;$!ba' collects all input lines into the pattern space,
#   * '<!...>' constructs are deleted,
#   * runs of newlines, tabs and spaces are collapsed to one space, and
#     spaces next to '<', '/' or '>' are dropped,
#   * 'tb;Tb;:b' jumps to label b whether or not a substitution happened,
#     which clears the "substitution made" flag before the next test,
#   * the text between the first <tag ...> inside <tags ...> and the last
#     </tag> before </tags> is kept; if there is no such match, 'Tc' jumps
#     to label c,
#   * remaining spaces become '+', each '</tag>...<tag ...>' boundary
#     becomes a space, the result is printed and sed quits,
#   * label c ends in an 'a\' command with empty text -- presumably to emit
#     an empty line that terminates the printed file name when no tags were
#     found; TODO confirm.
#
# NOTE(review): this span is a rendered diff hunk without +/- markers. It
# appears to contain both the removed and the added lines: there are two
# 'sed -rn' openers, the whitespace substitutions occur once as two lines and
# once as a single line, and there are two closing-quote lines (one with
# "$xml", one without). Read literally the quoting does not balance, so
# confirm against the real build/sources.sh which lines are live.
map_tags(){
for xml in "$@"; do
printf '%s ' "$xml"
sed -rn ':a;N;$!ba
unicat "$xml" \
| sed -rn ':a;N;$!ba
s;<!([^>]|<[^>]*>)*>;;g
s;[\n\t ]+; ;g
s; ?([</>]) ?;\1;g
s;[\n\t ]+; ;g; s; ?([</>]) ?;\1;g
tb;Tb;:b
s;.*<tags( [^>]+)?>[^<]*<tag( [^>]+)?>(.*)</tag>[^<]*</tags>.*;\3;;Tc
s; ;+;g
s;</tag>[^<]*<tag(\+[^>]+)?>; ;g;p;q
:c;a\
' "$xml"
'
done
}


+ 9
- 6
build/xmlfiles.sh View File

@@ -2,6 +2,14 @@

inc_xmlfiles=true

unicat(){
  # Write every given XML file to stdout, converted to UTF-8.
  #
  # Arguments: "$@" - paths of the XML files to convert
  # Outputs:   the converted content of each file, in argument order
  # Returns:   exit status of the last iconv invocation
  #
  # The source encoding is taken from the first line that carries an
  # "encoding" pseudo-attribute after a "<?". Files without one are
  # treated as UTF-8 already.
  # Note: assigns the non-local variables "file" and "enc".
  for file in "$@"; do
    # XML permits the value to be quoted with either " or ' and allows
    # whitespace around the "=", so accept all of these spellings.
    # T skips the q when the current line did not match; q stops reading
    # after the first hit so only one encoding name is ever returned.
    enc="$(sed -nr \
      -e "s:^.*<\\?.*encoding[[:space:]]*=[[:space:]]*[\"']([^\"']+)[\"'].*\$:\\1:p" \
      -e 'T' \
      -e 'q' \
      "$file")"
    iconv -f "${enc:-UTF-8}" -t "UTF-8" "$file"
  done
}

include_xml(){
# include second level elements of a given XML file
# this emulates the behaviour of the original
@@ -10,12 +18,7 @@ include_xml(){
file="$1"

if [ -f "$file" ]; then
# guess encoding from xml header
# we will convert everything to utf-8 prior to processing
enc="$(sed -nr 's:^.*<\?.*encoding="([^"]+)".*$:\1:p' "$file")"
[ -z "$enc" ] && enc="UTF-8"

iconv -f "$enc" -t "UTF-8" "$file" \
unicat "$file" \
| sed -r ':X; $bY; N; bX; :Y;
s:<(\?[xX][mM][lL]|!DOCTYPE)[[:space:]]+[^>]+>::g
s:<[^!][^>]*>::;

Loading…
Cancel
Save