#!/bin/sh -
#
# do things with a saved Confluence backup package.
# this expects backups saved from Confluence clowd (yes, "clowd").
#
# $Header: /local/adm/RCS/conflu-bkup-proc,v 1.55 2021/11/16 03:10:44 pkern Exp $
# Look in the site-local tool directories before the inherited search path.
PATH="/local/bin:/usr/local/bin:${PATH}"
export PATH

# self: the script as it was invoked (used in messages and to re-run itself).
# prog: the basename of self (used when naming temporary files).
self="$0"
prog=$(basename "${self}")

# libd: where the XSL stylesheets are looked up, relative to the
#       current working directory.
# cmd:  the sub-command, taken from the first argument.
libd='./lib'
cmd="$1"
case "${cmd}" in
usage)
  # Write the sub-command summary to stderr, then exit with status 1.
  # The heredoc delimiter is unquoted so that ${self} is expanded.
  cat 1>&2 << __EoUsage__
usage:
${self} diff-ziplists {list_1} {list_2}
${self} dig-xml {zipfile} {zipfile-list}
${self} obj-dump {confluence-xml}
${self} xml-digest {confluence-xml}
${self} xml-unpack {confluence-xml} {bodies-dir}
${self} digest-to-csv
${self} cvt-xhtml [ {confluence-page} ]
${self} xmlfmt
__EoUsage__
  exit 1
  ;;
dig-xml)
  # Extract the XML from a Confluence backup file and pretty-print it
  # on stdout.
  #   $2 - the backup zip file (must exist and be non-empty)
  #   $3 - a listing of that zip file (must exist and be non-empty)
  # The listing is checked first, so a package without the wanted member
  # is reported without running unzip.
  # Exit status: 1 on any failed check, otherwise that of the
  # unzip | xmlfmt pipeline.
  want=entities.xml
  zipf="${2}"
  if [ ! -s "${zipf}" ] ; then
    echo "${self}: not a zip file: '${zipf}'" >&2
    exit 1
  fi
  zlist="${3}"
  if [ ! -s "${zlist}" ] ; then
    echo "${self}: not a zipfile list? '${zlist}'" >&2
    exit 1
  fi
  # grep -F stands in for the obsolescent fgrep; -q because only the
  # match status is needed.  All paths are quoted so that names with
  # whitespace or glob characters are passed through intact.
  if ! grep -F -q -- "${want}" "${zlist}" ; then
    echo "${self}: zip-list does not include '${want}'" >&2
    exit 1
  fi
  unzip -p "${zipf}" "${want}" |
    "${self}" xmlfmt
  exit
  ;;
xmlfmt)
  # Make XML more readable by handing it to xmlindent.  Everything
  # after the sub-command name is passed along unchanged.
  shift
  exec xmlindent \
    -nas -nbe \
    -i 2 -l 72 \
    -f "$@"
  # exec replaces this shell, so control never gets here.
  ;;
diff-ziplists)
  # Compare two zip listings.
  # Confluence zip files have weird datestamps. ignore them.
  #   $2, $3 - the two listing files
  # Only fields 1, 3, 7 and the last field of every line are kept
  # before the two listings are handed to diff.
  # Exit status: that of diff (or 1 if a preparation step fails).
  f1="${2}" f2="${3}"
  # Both listings are required; an empty file operand would otherwise
  # make awk fail or fall back to reading stdin.
  if [ -z "${f1}" ] || [ -z "${f2}" ] ; then
    exec "${self}" usage
  fi
  # Use a private directory from mktemp instead of a predictable,
  # PID-based file name directly under /tmp.
  tmpd=$(mktemp -d "${TMPDIR:-/tmp}/${prog}.XXXXXX") || exit 1
  # Clean up on any exit; signals are turned into a normal exit so the
  # same cleanup runs for them as well.
  trap '/bin/rm -rf "${tmpd}"' 0
  trap 'exit 1' 1 2 3 15
  awk '{ print $1, $3, $7, $NF }' "${f1}" > "${tmpd}/1" || exit 1
  awk '{ print $1, $3, $7, $NF }' "${f2}" > "${tmpd}/2" || exit 1
  diff -c0 "${tmpd}/1" "${tmpd}/2"
  exit
  ;;
obj-dump)
  # input: Confluence backup xml
  # output: simple dump of objects.
  #
  # The stylesheet is written to a temporary file and applied with
  # xsltproc to every remaining argument.
  #
  # Drop the sub-command name first; otherwise "obj-dump" itself would
  # be handed to xsltproc as if it were an input document.
  shift
  # Unpredictable temp file from mktemp rather than a PID-based name.
  tmpxsl=$(mktemp "${TMPDIR:-/tmp}/${prog}.XXXXXX") || exit 1
  trap '/bin/rm -f "${tmpxsl}"' 0
  trap 'exit 1' 1 2 3 15
  # https://stackoverflow.com/questions/25662151/padding-number-with-leading-zeros-in-xslt-1-0
  # NOTE(review): the stylesheet text below is not XML as it stands, so
  # xsltproc cannot use it; it looks like the markup was lost somewhere.
  # Restore the original stylesheet from revision control.
  cat << __EOxsl__ > "${tmpxsl}"
id class space version ;
timestamp
title
__EOxsl__
  xsltproc "${tmpxsl}" "${@}"
  exit
  ;;
xml-digest|xml-unpack)
  #
  # process XML from Confluence backup pkg.
  #
  # digest = output a simple stream of objects, suitable for awk.
  # unpack = save BodyContent objects to individual files.
  #
  #   $2 - the Confluence XML file (required, must be non-empty)
  #   $3 - unpack only: an existing directory; it is passed to the
  #        stylesheet, with a trailing "/", as string parameter "savedir"
  #
  # Missing arguments lead to the usage text; a missing directory or
  # file is reported on stderr with exit status 1.
  #
  xmlfn="${2}"
  outd="${3}"
  [ -z "${xmlfn}" ] && exec "${self}" usage
  # From here on the positional parameters hold the extra xsltproc
  # options.  Keeping them as separate words (instead of one string
  # that is split again later) lets a directory name with whitespace
  # survive as a single argument.
  set --
  xslwork=""
  case "${cmd}" in
  *-digest)
    xslwork="${libd}/cnflu-bk-digest-xml.xsl"
    ;;
  *-unpack)
    [ -z "${outd}" ] && exec "${self}" usage
    xslwork="${libd}/cnflu-bk-unpack-bdy.xsl"
    if [ ! -d "${outd}" ] ; then
      echo "${self} : ${cmd} - no directory '${outd}'" >&2
      exit 1
    fi
    set -- --stringparam savedir "${outd}/"
    ;;
  esac
  # Both the stylesheet and the input must exist and be non-empty.
  for fn in "${xslwork}" "${xmlfn}"
  do
    if [ ! -s "${fn}" ] ; then
      echo "${self} : ${cmd} - no such file '${fn}'" >&2
      exit 1
    fi
  done
  exec xsltproc "$@" "${xslwork}" "${xmlfn}"
  # not reached.
  ;;
digest-to-csv)
  # input: output from "digest" step.
  # output: csv.
  #
  # Remaining arguments are passed to awk as input files; with none,
  # awk reads stdin.
  shift
  #
  # this awk script expects ...
  #
  #|
  #| obj Space id 471924708
  #| + created 2015-02-04 10:53:23.000
  #| + changed 2019-10-17 16:18:53.662
  #| + key VSS
  #| + name VSS Group
  #| + desc-id 471793664
  #| + child-id 471793666
  #|
  # ... that is: an "obj {type} id {id}" line opens an object,
  # "+ {label} {value}" lines add values to it, and a blank line
  # closes it.
  #
  # The CSV has one row per object and one column per label seen.
  # Repeated labels on one object are collected with "\f" between the
  # values while reading.
  #
  # NOTE(review): backslashes, double quotes and commas inside values
  # are replaced by the character references #92, quot and #44.  The
  # replacement strings are assembled from pieces (see BEGIN) so that
  # no literal entity appears in this file.  Confirm that this is the
  # escaping the consumers of the CSV expect.
  #
  awk '
    # spc_tag(oID, curr): add curr to the space-path of object oID,
    # copy its link-id, attachment-id and child-id values into
    # link-list, attachment-list and child-list (joined with crlf),
    # give every linked or attached object the path curr "/" oID, and
    # recurse into every child with that same path.
    # Everything after curr in the parameter list is a local variable.
    function spc_tag ( oID, curr,    _xn, _xv, _xo, _sep, i, got) {
      Labels["space-path"]++
      got = objVal[oID ":space-path"]
      if ( got == "" ) got = curr
      else if ( got != curr) got = got crlf curr
      objVal[oID ":space-path"] = got
      _xn = split( objVal[oID ":link-id"], _xv, "\f")
      if ( _xn > 0 ) {
        Labels[ "link-list" ]++
        _sep = ""
        for ( i = 1 ; i <= _xn ; i++ ) {
          _xo = _xv[i]
          objVal[oID ":link-list"] = objVal[oID ":link-list"] _sep _xo
          _sep = crlf
          # do not recurse for links.
          objVal[_xo ":space-path"] = curr "/" oID
        }
      }
      _xn = split( objVal[oID ":attachment-id"], _xv, "\f")
      if ( _xn > 0 ) {
        Labels[ "attachment-list" ]++
        _sep = ""
        for ( i = 1 ; i <= _xn ; i++ ) {
          _xo = _xv[i]
          objVal[oID ":attachment-list"] = objVal[oID ":attachment-list"] _sep _xo
          _sep = crlf
          # do not recurse for attachments.
          objVal[_xo ":space-path"] = curr "/" oID
        }
      }
      _xn = split( objVal[oID ":child-id"], _xv, "\f")
      if ( _xn > 0 ) {
        Labels[ "child-list" ]++
        _sep = ""
        for ( i = 1 ; i <= _xn ; i++ ) {
          _xo = _xv[i]
          objVal[oID ":child-list"] = objVal[oID ":child-list"] _sep _xo
          _sep = crlf
          spc_tag(_xo, curr "/" oID )
        }
      }
    }
    BEGIN {
      dq = "\"" ; sq = sprintf("%c", 39)
      # crlf is the four-character text backslash-r-backslash-n, used
      # to join list values inside one CSV field.
      crlf = "\\r\\n"
      # gsub() replacement strings.  A backslash in front of the
      # ampersand makes gsub() insert a literal ampersand instead of
      # the matched text.
      amp = sprintf("%c", 38)
      esc_bs = "\\" amp "#92;"
      esc_dq = "\\" amp "quot;"
      esc_cm = "\\" amp "#44;"
      objVal[""] = ""
      space_n[""] = space_o[""] = ""
      # Confluence-defined space keys.
      addUrl["http"] = addUrl["https"] = addUrl["mailto"] = 1
      addUrl["ftp"] = addUrl["ldaps"] = addUrl["udp"] = 1
      addUrl["dev"] = addUrl["file"] = 1
      addUrl["hc"] = addUrl["rr"] = 1
      # ... are these ones some kind of typo ...?
      addUrl["/https"] = addUrl[";http"] = 1
    }
    # A blank line closes the current object.
    NF == 0 {
      if ( objID == "" ) next
      objtyp = objVal[objID ":obj-type"]
      # XXX - this expects that Spaces are listed first.
      if ( objtyp == "Space" ) {
        # remember the mapping space key <-> object id.
        idnam = objVal[objID ":key"]
        space_n[idnam] = objID ; space_o[objID] = idnam
        objID = "" ; next
      }
      # XXX - this expects that BodyContents are listed last
      # XXX - so that BodyContent parents are already defined.
      # XXX - also assumes BodyContents only have one parent/xref.
      if ( objtyp == "Body" && objVal[objID ":title"] == "" ) {
        # copy title from xref-id, if available.
        xr_ID = objVal[objID ":xref-id"]
        if ( xr_ID != "" ) {
          xr_TL = objVal[xr_ID ":title"]
          if ( xr_TL != "" )
            objVal[objID ":title"] = xr_TL
        }
      }
      # build a dest-url, if parts are available.
      spckey = objVal[objID ":space-key"]
      if ( addUrl[spckey] > 0 ) {
        lnkdst = objVal[objID ":dest-title"]
        if ( lnkdst != "" ) {
          lbl = "dest-url"
          Labels[lbl]++
          url = spckey ":" lnkdst
          #delay# url = "" url ""
          url = objVal[objID ":" lbl] " " url
          objVal[objID ":" lbl] = url
          objHas[objID] = objHas[objID] " " lbl
          # avoid duplication.
          objVal[objID ":space-key"] = ""
          objVal[objID ":dest-title"] = ""
        }
      }
      objID = ""
      next
    }
    # "obj {type} id {id}" opens a new object.
    $1$3 == "objid" {
      lbl = "obj-type"
      objID = $NF ; val = $2
      Labels[lbl]++
      objHas[objID] = lbl
      objVal[objID ":" lbl] = val
      objGot[objID] = 0
      next
    }
    $1$2 == "+created" || $1$2 == "+changed" {
      # shrink timestamp parts.
      # e.g. "2015-02-04 10:53:23.000" becomes "20150204-105323.000"
      lbl = $2 ; Labels[lbl]++
      objHas[objID] = objHas[objID] " " lbl
      if ( $3 $4 == "" ) next
      d = substr($3, 1, 4) substr($3, 6, 2) substr($3, 9)
      t = substr($4, 1, 2) substr($4, 4, 2) substr($4, 7)
      objVal[objID ":" lbl] = d "-" t
      next
    }
    # Any other "+ {label} {value}" line.
    $1 == "+" {
      line = $0
      lbl = $2 ; Labels[lbl]++
      # save remainder of line "as is" ...
      x = index(line, lbl) + length(lbl) + 1
      val = substr(line, x)
      # ... but ...
      # ... escape backslashes ...
      if ( index(val, "\\") > 0 ) {
        gsub("\\\\", esc_bs, val)
      }
      # ... escape dbl-quotes ...
      if ( index(val, dq) > 0 ) {
        gsub(dq, esc_dq, val)
      }
      # ... escape commas to avoid CSV conflicts.
      if ( index(val, ",") > 0 ) {
        gsub(",", esc_cm, val)
      }
      objHas[objID] = objHas[objID] " " lbl
      # a repeated label appends to the earlier value(s).
      ox = objID ":" lbl ; ov = objVal[ox];
      if ( length(ov) > 0 ) val = ov "\f" val
      objVal[ox] = val
    }
    END {
      # label the space trees.
      for ( sobj in space_o ) {
        if ( sobj == "" ) continue
        spcnam = "{" space_o[sobj] "}"
        spc_tag(sobj, spcnam)
      }
      # output the CSV.
      # header row first, then one row per object; both walk Labels
      # with the same for-in loop.
      list = dq "obj-id" dq
      for ( l in Labels ) list = list "," dq l dq
      print list
      for ( o in objHas ) {
        line = dq o dq
        for ( l in Labels )
          line = line "," dq objVal[o ":" l] dq
        #db# print list
        print line
      }
    }' "${@}"
  exit
  ;;
cvt-xhtml)
  # Convert a Confluence page to xhtml.
  # The page is read from the files named after the sub-command, or
  # from stdin when there are none.  It is fed to xsltproc with a
  # heredoc in front (empty as it stands) and an empty line behind.
  # xsltproc diagnostics are discarded and the first line of its output
  # is dropped.
  shift
  sheet=${libd}/cnflu-to-xhtml.xsl
  pathopt="--path ${libd}/_GH"
  safeopts="--nomkdir --nonet --nowrite"
  {
    cat <<__EOxml__
__EOxml__
    if [ "$#" -eq 0 ] ; then
      cat -
    else
      cat "${@}"
    fi
    echo ""
  } |
    xsltproc ${pathopt} ${safeopts} ${sheet} - 2> /dev/null |
    sed 1d
  # the exit status is that of the pipeline above.
  exit
  ;;
*)
  # Unknown (or missing) sub-command: show the usage text instead.
  # ${self} is quoted so that a script path containing whitespace is
  # still re-executed as one word.
  exec "${self}" usage
  # not reached.
  ;;
esac