#!/bin/sh -
#
# do things with a saved Confluence backup package.
# this expects backups saved from Confluence clowd (yes, "clowd").
#
# The first argument selects a sub-command; run "usage" for the list.
# Several sub-commands re-invoke this script through ${self}, and the
# XSL stylesheets are looked up under ${libd}, which is relative to
# the current working directory.
#
# $Header: /local/adm/RCS/conflu-bkup-proc,v 1.55 2021/11/16 03:10:44 pkern Exp $

PATH=/local/bin:/usr/local/bin:$PATH
export PATH

self="${0}"
prog=$(basename "${self}")
libd=./lib

cmd="${1}"

case "${cmd}" in
usage)
    # the usage text goes to stderr and always yields a failure status.
    exec >&2
    cat << __EoUsage__
usage:  ${self} diff-ziplists {list_1} {list_2}
        ${self} dig-xml {zipfile} {zipfile-list}
        ${self} obj-dump {confluence-xml}
        ${self} xml-digest {confluence-xml}
        ${self} xml-unpack {confluence-xml} {bodies-dir}
        ${self} digest-to-csv
        ${self} cvt-xhtml [ {confluence-page} ]
        ${self} xmlfmt
__EoUsage__
    exit 1
    ;;

dig-xml)
    # extract XML from Confluence backup file.
    #   $2 = the zip file, $3 = a saved listing of that zip file.
    # the listing is only consulted to confirm that ${want} is present.
    want=entities.xml

    zipf="${2}"
    if [ ! -s "${zipf}" ] ; then
        echo "${self}: not a zip file: '${zipf}'" >&2
        exit 1
    fi

    zlist="${3}"
    if [ ! -s "${zlist}" ] ; then
        echo "${self}: not a zipfile list? '${zlist}'" >&2
        exit 1
    fi

    if ! grep -F -q -- "${want}" "${zlist}" ; then
        echo "${self}: zip-list does not include '${want}'" >&2
        exit 1
    fi

    unzip -p "${zipf}" "${want}" | "${self}" xmlfmt
    exit
    ;;

xmlfmt)
    # make XML more readable.
    # any remaining arguments are handed to xmlindent untouched.
    shift
    exec xmlindent -nas -nbe -i 2 -l 72 -f "$@"
    # not reached.
    ;;

diff-ziplists)
    # Confluence zip files have weird datestamps. ignore them.
    # only columns 1, 3, 7 and the last column of each listing are
    # compared.
    f1="${2}"
    f2="${3}"

    # private scratch directory instead of a guessable /tmp name.
    tmpd=$(mktemp -d "${TMPDIR:-/tmp}/${prog}.XXXXXX") || exit 1
    trap '/bin/rm -rf -- "${tmpd}" ; exit' 0 1 2 3 15

    awk '{ print $1, $3, $7, $NF }' "${f1}" > "${tmpd}/1"
    awk '{ print $1, $3, $7, $NF }' "${f2}" > "${tmpd}/2"
    diff -c0 "${tmpd}/1" "${tmpd}/2"
    exit
    ;;

obj-dump)
    # input: Confluence backup xml
    # output: simple dump of objects.
    #
    # drop the sub-command word so that only the XML file names reach
    # xsltproc.
    shift

    tmpd=$(mktemp -d "${TMPDIR:-/tmp}/${prog}.XXXXXX") || exit 1
    trap '/bin/rm -rf -- "${tmpd}" ; exit' 0 1 2 3 15
    tmpxsl="${tmpd}/xsl"

    # https://stackoverflow.com/questions/25662151/padding-number-with-leading-zeros-in-xslt-1-0
    #
    # NOTE(review): the XSL markup of this here-document is missing from
    # the copy under review; only its literal text survives and is kept
    # as found.  Restore the stylesheet from the RCS original before
    # relying on this sub-command.
    cat << __EOxsl__ > "${tmpxsl}"
id class space version ; timestamp title
__EOxsl__

    xsltproc "${tmpxsl}" "$@"
    exit
    ;;

xml-digest|xml-unpack)
    #
    # process XML from Confluence backup pkg.
    #
    # digest = output a simple stream of objects, suitable for awk.
    # unpack = save BodyContent objects to individual files.
    #
    #   $2 = the Confluence XML file
    #   $3 = (unpack only) an existing directory for the saved bodies
    #
    xslwork=""
    xmlfn="${2}"
    outd="${3}"
    if [ -z "${xmlfn}" ] ; then
        exec "${self}" usage
    fi

    # the positional parameters are reused to carry extra xsltproc
    # options, so that a directory name with blanks stays one argument.
    set --

    case "${cmd}" in
    *-digest)
        xslwork=${libd}/cnflu-bk-digest-xml.xsl
        ;;
    *-unpack)
        if [ -z "${outd}" ] ; then
            exec "${self}" usage
        fi
        xslwork=${libd}/cnflu-bk-unpack-bdy.xsl
        if [ ! -d "${outd}" ] ; then
            echo "${self} : ${cmd} - no directory '${outd}'" >&2
            exit 1
        fi
        set -- --stringparam savedir "${outd}/"
        ;;
    esac

    for fn in "${xslwork}" "${xmlfn}"
    do
        if [ ! -s "${fn}" ] ; then
            echo "${self} : ${cmd} - no such file '${fn}'" >&2
            exit 1
        fi
    done

    exec xsltproc "$@" "${xslwork}" "${xmlfn}"
    # not reached.
    ;;

digest-to-csv)
    # input: output from "digest" step.
    # output: csv.
    #
    # input comes from the files named after the sub-command, or from
    # stdin when none are given.
    shift

    #
    # this awk script expects ...
    #
    #|
    #| obj Space id 471924708
    #| + created 2015-02-04 10:53:23.000
    #| + changed 2019-10-17 16:18:53.662
    #| + key VSS
    #| + name VSS Group
    #| + desc-id 471793664
    #| + child-id 471793666
    #|
    #
    # i.e. one "obj TYPE id ID" line per object, then "+ label value"
    # lines, then an empty line that closes the object.
    #
    # NB: the awk program sits inside shell single quotes, so it must
    # not contain a single-quote character, not even in a comment.
    #
    awk '
# spc_tag(oID, curr)
#   record "curr" as the space-path of object oID, then walk its
#   link-id, attachment-id and child-id values (each a formfeed
#   separated list).  the ids are collected into link-list,
#   attachment-list and child-list, joined with crlf.  linked and
#   attached objects only get a space-path; children are visited
#   recursively with the path extended by oID.
#   names after "curr" are local work variables.
function spc_tag ( oID, curr,    _xn, _xv, _xo, _sep, i, got) {
    Labels["space-path"]++
    got = objVal[oID ":space-path"]
    if ( got == "" )
        got = curr
    else if ( got != curr)
        got = got crlf curr
    objVal[oID ":space-path"] = got

    _xn = split( objVal[oID ":link-id"], _xv, "\f")
    if ( _xn > 0 ) {
        Labels[ "link-list" ]++
        _sep = ""
        for ( i = 1 ; i <= _xn ; i++ ) {
            _xo = _xv[i]
            objVal[oID ":link-list"] = objVal[oID ":link-list"] _sep _xo
            _sep = crlf
            # do not recurse for links.
            objVal[_xo ":space-path"] = curr "/" oID
        }
    }

    _xn = split( objVal[oID ":attachment-id"], _xv, "\f")
    if ( _xn > 0 ) {
        Labels[ "attachment-list" ]++
        _sep = ""
        for ( i = 1 ; i <= _xn ; i++ ) {
            _xo = _xv[i]
            objVal[oID ":attachment-list"] = objVal[oID ":attachment-list"] _sep _xo
            _sep = crlf
            # do not recurse for attachments.
            objVal[_xo ":space-path"] = curr "/" oID
        }
    }

    _xn = split( objVal[oID ":child-id"], _xv, "\f")
    if ( _xn > 0 ) {
        Labels[ "child-list" ]++
        _sep = ""
        for ( i = 1 ; i <= _xn ; i++ ) {
            _xo = _xv[i]
            objVal[oID ":child-list"] = objVal[oID ":child-list"] _sep _xo
            _sep = crlf
            spc_tag(_xo, curr "/" oID )
        }
    }
}

BEGIN {
    # sq is built with sprintf because a literal single quote cannot
    # appear inside this shell-quoted program.
    dq = "\"" ; sq = sprintf("%c", 39)
    # a literal backslash-r backslash-n joins list items in one cell.
    crlf = "\\r\\n"
    objVal[""] = ""
    space_n[""] = space_o[""] = ""

    # Confluence-defined space keys.
    # an object whose space-key is one of these gets a dest-url.
    addUrl["http"] = addUrl["https"] = addUrl["mailto"] = 1
    addUrl["ftp"] = addUrl["ldaps"] = addUrl["udp"] = 1
    addUrl["dev"] = addUrl["file"] = 1
    addUrl["hc"] = addUrl["rr"] = 1
    # ... are these ones some kind of typo ...?
    addUrl["/https"] = addUrl[";http"] = 1
}

# an empty line closes the current object.
NF == 0 {
    if ( objID == "" ) next
    objtyp = objVal[objID ":obj-type"]

    # XXX - this expects that Spaces are listed first.
    if ( objtyp == "Space" ) {
        idnam = objVal[objID ":key"]
        space_n[idnam] = objID ; space_o[objID] = idnam
        objID = "" ; next
    }

    # XXX - this expects that BodyContents are listed last
    # XXX - so that BodyContent parents are already defined.
    # XXX - also assumes BodyContents only have one parent/xref.
    if ( objtyp == "Body" && objVal[objID ":title"] == "" ) {
        # copy title from xref-id, if available.
        xr_ID = objVal[objID ":xref-id"]
        if ( xr_ID != "" ) {
            xr_TL = objVal[xr_ID ":title"]
            if ( xr_TL != "" )
                objVal[objID ":title"] = xr_TL
        }
    }

    # build a dest-url, if parts are available.
    spckey = objVal[objID ":space-key"]
    if ( addUrl[spckey] > 0 ) {
        lnkdst = objVal[objID ":dest-title"]
        if ( lnkdst != "" ) {
            lbl = "dest-url"
            Labels[lbl]++
            url = spckey ":" lnkdst
            #delay# url = "" url ""
            url = objVal[objID ":" lbl] " " url
            objVal[objID ":" lbl] = url
            objHas[objID] = objHas[objID] " " lbl
            # avoid duplication.
            objVal[objID ":space-key"] = ""
            objVal[objID ":dest-title"] = ""
        }
    }
    objID = ""
    next
}

# "obj TYPE id ID" opens a new object.
$1$3 == "objid" {
    lbl = "obj-type"
    objID = $NF ; val = $2
    Labels[lbl]++
    objHas[objID] = lbl
    objVal[objID ":" lbl] = val
    objGot[objID] = 0
    next
}

$1$2 == "+created" || $1$2 == "+changed" {
    # shrink timestamp parts.
    # "2015-02-04 10:53:23.000" becomes "20150204-105323.000".
    lbl = $2 ; Labels[lbl]++
    objHas[objID] = objHas[objID] " " lbl
    if ( $3 $4 == "" ) next
    d = substr($3, 1, 4) substr($3, 6, 2) substr($3, 9)
    t = substr($4, 1, 2) substr($4, 4, 2) substr($4, 7)
    objVal[objID ":" lbl] = d "-" t
    next
}

# any other "+ label value" line.
$1 == "+" {
    line = $0
    lbl = $2 ; Labels[lbl]++
    # save remainder of line "as is" ...
    x = index(line, lbl) + length(lbl) + 1
    val = substr(line, x)
    # ... but ...
    #
    # NOTE(review): the three replacement strings below are written as
    # HTML entities.  the copy under review showed them entity-decoded,
    # which left two of them as unterminated awk strings.  confirm
    # against the RCS original.
    #
    # ... escape backslashes ...
    if ( index(val, "\\") > 0 ) {
        gsub("\\\\", "\\&#92;", val)
    }
    # ... escape dbl-quotes ...
    if ( index(val, dq) > 0 ) {
        gsub(dq, "\\&quot;", val)
    }
    # ... escape commas to avoid CSV conflicts.
    if ( index(val, ",") > 0 ) {
        gsub(",", "\\&#44;", val)
    }
    objHas[objID] = objHas[objID] " " lbl
    # a repeated label is appended to the earlier value(s), separated
    # by a formfeed.
    ox = objID ":" lbl ; ov = objVal[ox];
    if ( length(ov) > 0 )
        val = ov "\f" val
    objVal[ox] = val
}

END {
    # label the space trees.
    for ( sobj in space_o ) {
        if ( sobj == "" ) continue
        spcnam = "{" space_o[sobj] "}"
        spc_tag(sobj, spcnam)
    }

    # output the CSV.
    # header row first, then one row per object; every row walks
    # Labels with the same loop so the columns line up.
    list = dq "obj-id" dq
    for ( l in Labels )
        list = list "," dq l dq
    print list

    for ( o in objHas ) {
        line = dq o dq
        for ( l in Labels )
            line = line "," dq objVal[o ":" l] dq
        #db# print list
        print line
    }
}' "$@"
    exit
    ;;

cvt-xhtml)
    # convert a Confluence page to xhtml.
    # the page is read from the named file(s), or from stdin when no
    # file is given.
    shift
    xslwork=${libd}/cnflu-to-xhtml.xsl

    # NOTE(review): the here-document and the closing echo carried XML
    # markup that is missing from the copy under review (presumably a
    # prologue and a matching end tag wrapped around the page).  both
    # are kept as found; restore them from the RCS original.
    #
    # xsltproc diagnostics are discarded and the first output line is
    # dropped, as before.
    (
        cat <<__EOxml__
__EOxml__
        case "$#" in
        0) cat - ;;
        *) cat "$@" ;;
        esac
        echo ""
    ) |
        xsltproc --path "${libd}/_GH" --nomkdir --nonet --nowrite \
            "${xslwork}" - 2> /dev/null |
        sed 1d
    exit
    ;;

*)
    # unknown arg.
    exec "${self}" usage
    # not reached.
    ;;
esac