bugfix: handling non-UTF-8 files in tagging

svn path=/trunk/; revision=33448
This commit is contained in:
paul 2016-06-06 14:32:16 +00:00
parent 90a8fd8130
commit 354977821d
2 changed files with 13 additions and 10 deletions

View File

@ -13,16 +13,16 @@ validate_tagmap(){
map_tags(){
# For every XML file given as an argument, print the file name followed by
# a space (no newline) and then let sed emit that file's tag list.
#
# NOTE(review): this listing looks like a rendered diff whose +/- markers
# were dropped - the sed call appears both reading "$xml" directly and fed
# from `unicat "$xml"`, and the whitespace rules appear both as two lines
# and as one merged line. Confirm which variant is live before editing.
#
# Steps of the sed script, in order:
#   :a;N;$!ba     read the whole input into the pattern space
#   s;<!...>;;g   delete <!...> constructs, including one level of nested <...>
#   s;[\n\t ]+;   squeeze each run of newlines/tabs/spaces into one space
#   s; ?([</>])   drop the space next to "<", "/" and ">"
#   tb;Tb;:b      both branches land on :b; this clears the "substituted"
#                 flag so the following Tc only reflects the next s command
#   s;.*<tags...  keep only the text between the first <tag> and the last
#                 </tag> inside <tags>; when nothing matches, jump to :c
#   s; ;+;g       spaces inside tag values become "+"
#   s;</tag>...   each </tag>...<tag> boundary becomes one space; then
#                 print the result and quit
#   :c;a\         no-match path; presumably queues an empty line so the
#                 printed file name is still terminated - TODO confirm
for xml in "$@"; do
printf '%s ' "$xml"
sed -rn ':a;N;$!ba
unicat "$xml" \
| sed -rn ':a;N;$!ba
s;<!([^>]|<[^>]*>)*>;;g
s;[\n\t ]+; ;g
s; ?([</>]) ?;\1;g
s;[\n\t ]+; ;g; s; ?([</>]) ?;\1;g
tb;Tb;:b
s;.*<tags( [^>]+)?>[^<]*<tag( [^>]+)?>(.*)</tag>[^<]*</tags>.*;\3;;Tc
s; ;+;g
s;</tag>[^<]*<tag(\+[^>]+)?>; ;g;p;q
:c;a\
' "$xml"
'
done
}

View File

@ -2,6 +2,14 @@
inc_xmlfiles=true
unicat(){
  # Write every given XML file to stdout, converted to UTF-8.
  #
  # Arguments: $@ - paths of the XML files to convert
  # Outputs:   the converted content of all files, in argument order
  #
  # The source encoding is the first encoding="..." or encoding='...'
  # value found on a line that also contains "<?"; a file without one is
  # treated as UTF-8.
  # NOTE: "file" and "enc" are ordinary (global) shell variables.
  for file in "$@"; do
    # sed flow: bA jumps over the quit command; at :A the s command prints
    # the encoding name when the line matches, and tQ then branches to :Q
    # to quit, so only the first match is reported.
    # Group 2 captures a double-quoted value, group 3 a single-quoted one;
    # XML permits both quote styles and exactly one of the groups is set.
    # LC_ALL=C makes "." match any single byte, so the pattern still
    # matches when the same line holds bytes that are invalid in UTF-8.
    enc="$(LC_ALL=C sed -nr 'bA; :Q q; :A s:^.*<\?.*encoding=("([^"]+)"|'\''([^'\'']+)'\'').*$:\2\3:p; tQ' "$file")"
    iconv -f "${enc:-UTF-8}" -t "UTF-8" "$file"
  done
}
include_xml(){
# include second level elements of a given XML file
# this emulates the behaviour of the original
@ -10,12 +18,7 @@ include_xml(){
file="$1"
if [ -f "$file" ]; then
# guess encoding from xml header
# we will convert everything to utf-8 prior to processing
enc="$(sed -nr 's:^.*<\?.*encoding="([^"]+)".*$:\1:p' "$file")"
[ -z "$enc" ] && enc="UTF-8"
iconv -f "$enc" -t "UTF-8" "$file" \
unicat "$file" \
| sed -r ':X; $bY; N; bX; :Y;
s:<(\?[xX][mM][lL]|!DOCTYPE)[[:space:]]+[^>]+>::g
s:<[^!][^>]*>::;