#!/bin/sh -
#
# do things with a saved Confluence backup package.
# this expects backups saved from Confluence clowd (yes, "clowd").
#
# $Header: /local/adm/RCS/conflu-bkup-proc,v 1.55 2021/11/16 03:10:44 pkern Exp $

# locally installed tools take precedence over the inherited search path.
PATH="/local/bin:/usr/local/bin:${PATH}"
export PATH

# how this script was invoked (used in messages and when it re-runs
# itself), and the short name used for scratch files.
self=$0
prog=$(basename "${self}")

# where the XSL stylesheets live, relative to the working directory.
libd=./lib

# the requested sub-command.
cmd=$1

case "${cmd}" in
usage)
	# show the sub-command summary on stderr, then report failure.
	cat >&2 << __UsageText__

usage:

  ${self}  diff-ziplists {list_1} {list_2}

  ${self}  dig-xml {zipfile} {zipfile-list}

  ${self}  obj-dump   {confluence-xml}

  ${self}  xml-digest {confluence-xml}
  ${self}  xml-unpack {confluence-xml} {bodies-dir}

  ${self}  digest-to-csv

  ${self}  cvt-xhtml [ {confluence-page} ]

  ${self}  xmlfmt
	
__UsageText__

	exit 1
	;;

dig-xml)
	# extract XML from Confluence backup file and pretty-print it
	# on stdout.
	#
	#   $2 = the backup zip file.
	#   $3 = a saved listing of that zip file.
	#
	# the listing is only consulted to confirm that the wanted
	# member is present before unzip is run.

	want=entities.xml

	zipf="${2}"
	if [ ! -s "${zipf}" ] ; then
		echo "${self}: not a zip file: '${zipf}'" >&2
		exit 1
	fi

	zlist="${3}"
	if [ ! -s "${zlist}" ] ; then
		echo "${self}: not a zipfile list? '${zlist}'" >&2
		exit 1
	fi

	# "grep -F" stands in for the obsolescent fgrep.  every expansion
	# is quoted so that paths holding blanks or glob characters are
	# passed on as single words.
	has=$(grep -F -- "${want}" "${zlist}")

	if [ -z "${has}" ] ; then
		echo "${self}: zip-list does not include '${want}'" >&2
		exit 1
	fi

	unzip -p "${zipf}" "${want}" |
		"${self}" xmlfmt

	exit
	;;

xmlfmt)
	# make XML more readable: hand over to xmlindent with a fixed
	# set of formatting switches (see xmlindent(1)).  anything after
	# the sub-command name is passed along unchanged.
	shift 1

	exec xmlindent \
		-nas -nbe \
		-i 2 -l 72 \
		-f "$@"
	# not reached.
	;;

diff-ziplists)
	# compare two saved zip listings.
	# Confluence zip files have weird datestamps. ignore them.
	#
	#   $2, $3 = the two listing files.
	#
	# only fields 1, 3, 7 and the last field of each listing line
	# take part in the comparison.

	f1="${2}" f2="${3}"
	if [ -z "${f1}" ] || [ -z "${f2}" ] ; then
		# both listings are required.
		exec "${self}" usage
	fi

	# scratch space comes from mktemp so that its name cannot be
	# guessed (and created beforehand) by another user of /tmp.
	tmpd=$(mktemp -d "/tmp/${prog}.XXXXXX") || {
		echo "${self}: cannot create a scratch directory" >&2
		exit 1
	}
	trap '/bin/rm -rf "${tmpd}" ; exit' 0 1 2 3 15

	awk '{ print $1, $3, $7, $NF }' "${f1}" > "${tmpd}/1"
	awk '{ print $1, $3, $7, $NF }' "${f2}" > "${tmpd}/2"

	diff -c0 "${tmpd}/1" "${tmpd}/2"

	# the exit status is the one left by diff.
	exit
	;;

obj-dump)
        # input: Confluence backup xml
        # output: simple dump of objects.
	#
	# a throw-away stylesheet is written to a scratch file and applied
	# to every file named after the sub-command.

	# drop the sub-command name, so that only the XML file names are
	# handed to xsltproc.
	shift
	if [ "$#" -eq 0 ] ; then
		exec "${self}" usage
	fi

	# scratch file from mktemp: its name cannot be guessed (and created
	# beforehand) by another user of /tmp.
	tmpxsl=$(mktemp "/tmp/${prog}.xsl.XXXXXX") || {
		echo "${self}: cannot create a scratch file" >&2
		exit 1
	}
	trap '/bin/rm -f "${tmpxsl}" ; exit' 0 1 2 3 15

# https://stackoverflow.com/questions/25662151/padding-number-with-leading-zeros-in-xslt-1-0
# the delimiter is quoted: the stylesheet is literal text, nothing in it
# is meant for the shell to expand.
cat << '__EOxsl__' > "${tmpxsl}"
<?xml version="1.0" encoding="UTF-8"?>

<xsl:stylesheet version="1.0" 
  xmlns:xsl="http://www.w3.org/1999/XSL/Transform" 
  >
  <xsl:output method="text"/>

  <xsl:template match="/">
    <xsl:for-each select="hibernate-generic/object">
id <xsl:value-of select="id"/> class <xsl:value-of select="@class"/> space <xsl:value-of select="normalize-space(property[@name='space'])"/> version <xsl:value-of select="format-number(property[@name='version'], '0000')"/> ;
	timestamp <xsl:value-of select="property[@name='lastModificationDate']"/>
	title <xsl:value-of select="property[@name='title']"/>
    </xsl:for-each>
<xsl:text>
</xsl:text>
  </xsl:template>

</xsl:stylesheet>
__EOxsl__

	xsltproc "${tmpxsl}" "${@}"

	# the exit status is the one left by xsltproc.
	exit
	;;

xml-digest|xml-unpack)
	#
	# process XML from Confluence backup pkg.
	#
	#   digest = output a simple stream of objects, suitable for awk.
	#   unpack = save BodyContent objects to individual files.
	#
	#   $2 = the Confluence XML file.
	#   $3 = (unpack only) an existing directory, handed to the
	#        stylesheet as its "savedir" parameter.
	#
	# the positional parameters are re-used to carry the optional
	# xsltproc arguments, so that every path reaches xsltproc as one
	# word even when it contains blanks.
	#

	xmlfn="${2}"
	outd="${3}"
	[ -z "${xmlfn}" ] && exec "${self}" usage

	xslwork=""
	set --

	case "${cmd}" in
	*-digest)
		xslwork="${libd}/cnflu-bk-digest-xml.xsl"
		;;
	*-unpack)
		[ -z "${outd}" ] && exec "${self}" usage

		xslwork="${libd}/cnflu-bk-unpack-bdy.xsl"

		if [ ! -d "${outd}" ] ; then
			echo "${self} : ${cmd} - no directory '${outd}'" >&2
			exit 1
		fi

		set -- --stringparam savedir "${outd}/"
		;;
	esac

	# both the stylesheet and the input must exist and be non-empty.
	for fn in "${xslwork}" "${xmlfn}"
	do
		if [ ! -s "${fn}" ] ; then
			echo "${self} : ${cmd} - no such file '${fn}'" >&2
			exit 1
		fi
	done

	exec xsltproc "$@" "${xslwork}" "${xmlfn}"
	# not reached.
	;;

digest-to-csv)
	# input: output from "digest" step.
	# output: csv.
	#
	# reads the files named after the sub-command (stdin when none).

	shift

#
# this awk script expects ...
#
#|
#| obj Space id 471924708
#|    +    created 2015-02-04 10:53:23.000
#|    +    changed 2019-10-17 16:18:53.662
#|    +    key VSS
#|    +    name VSS Group
#|    +    desc-id 471793664
#|    +    child-id 471793666
#|
#
# how the awk script reads that stream:
#
#   "obj TYPE id ID"
#	starts an object; TYPE is stored as its "obj-type".
#   "+ created DATE TIME" and "+ changed DATE TIME"
#	the timestamp is squeezed, e.g. the "created" line above is
#	stored as 20150204-105323.000 .
#   "+ LABEL VALUE"
#	stores VALUE (the rest of the line) under LABEL.  backslash,
#	double-quote and comma are rewritten as &bsol; &quot; and &comma;
#	and a LABEL repeated within one object has its values joined
#	by form-feeds.
#   empty line
#	ends the object and triggers the fix-ups:
#	  - a Space is registered under its "key".
#	  - a Body without a title borrows the title of the object
#	    named by its "xref-id".
#	  - when "space-key" is one of the keys listed in addUrl and
#	    "dest-title" is set, "dest-url" gains "space-key:dest-title"
#	    and both of those parts are blanked.
#
# at end of input spc_tag() walks each Space through its "child-id"
# values (recursively) and fills in "space-path", "link-list",
# "attachment-list" and "child-list"; list members are joined by a
# literal backslash-r backslash-n pair.  linked and attached objects
# get a "space-path" too, but are not descended into.
#
# the CSV has one header row ("obj-id" plus every label seen) and one
# row per object, every field double-quoted.  the columns follow the
# order of the awk for-in traversal of Labels, which awk leaves
# unspecified.
#
# NOTE(review): the fix-ups run only on an empty line, so a final
# object that is not followed by one is output without them - confirm
# that the digest step always ends with an empty line.
#

	awk '
function spc_tag ( oID, curr, _xn, _xv, _xo, i, got) {
	Labels["space-path"]++

	got = objVal[oID ":space-path"]
	if ( got == "" )	got = curr
	else if ( got != curr)	got = got crlf curr
	objVal[oID ":space-path"] = got

	_xn = split( objVal[oID ":link-id"], _xv, "\f")
	if ( _xn > 0 ) {
		Labels[ "link-list" ]++
		_sep = ""
		for ( i = 1 ; i <= _xn ; i++ ) {
			_xo = _xv[i]
			objVal[oID ":link-list"] = objVal[oID ":link-list"] _sep _xo
			_sep = crlf

			# do not recurse for links.
			objVal[_xo ":space-path"] = curr "/" oID
		}
	}

	_xn = split( objVal[oID ":attachment-id"], _xv, "\f")
	if ( _xn > 0 ) {
		Labels[ "attachment-list" ]++
		_sep = ""
		for ( i = 1 ; i <= _xn ; i++ ) {
			_xo = _xv[i]
			objVal[oID ":attachment-list"] = objVal[oID ":attachment-list"] _sep _xo
			_sep = crlf

			# do not recurse for attachments.
			objVal[_xo ":space-path"] = curr "/" oID
		}
	}

	_xn = split( objVal[oID ":child-id"], _xv, "\f")
	if ( _xn > 0 ) {
		Labels[ "child-list" ]++
		_sep = ""
		for ( i = 1 ; i <= _xn ; i++ ) {
			_xo = _xv[i]
			objVal[oID ":child-list"] = objVal[oID ":child-list"] _sep _xo
			_sep = crlf

			spc_tag(_xo, curr "/" oID )
		}
	}

}
BEGIN {
	dq = "\"" ; sq = sprintf("%c", 39)
	crlf = "\\r\\n"
	objVal[""] = ""
	space_n[""] = space_o[""] = ""

	# Confluence-defined space keys.
	addUrl["http"] = addUrl["https"] = addUrl["mailto"] = 1
	addUrl["ftp"] = addUrl["ldaps"] = addUrl["udp"] = 1
	addUrl["dev"] = addUrl["file"] = 1
	addUrl["hc"] = addUrl["rr"] = 1

	# ... are these ones some kind of typo ...?
	addUrl["/https"] = addUrl[";http"] = 1
}
NF == 0 {
	if ( objID == "" ) next

	objtyp = objVal[objID ":obj-type"]

	# XXX - this expects that Spaces are listed first.
	if ( objtyp == "Space" ) {
		idnam = objVal[objID ":key"]
		space_n[idnam] = objID ; space_o[objID] = idnam
		objID = "" ; next
	}

	# XXX - this expects that BodyContents are listed last
	# XXX - so that BodyContent parents are already defined.
	# XXX - also assumes BodyContents only have one parent/xref.
	if ( objtyp == "Body" && objVal[objID ":title"] == "" ) {
		# copy title from xref-id, if available.
		xr_ID = objVal[objID ":xref-id"]
		if ( xr_ID != "" ) {
			xr_TL = objVal[xr_ID ":title"]
			if ( xr_TL != "" )
				objVal[objID ":title"] = xr_TL
		}
	}

	# build a dest-url, if parts are available.
	spckey = objVal[objID ":space-key"]
	if ( addUrl[spckey] > 0 ) {
		lnkdst = objVal[objID ":dest-title"]
		if ( lnkdst != "" ) {
			lbl = "dest-url"
			Labels[lbl]++
			url = spckey ":" lnkdst
#delay#			url = "<a href=" sq url sq ">" url "</a>"
			url = objVal[objID ":" lbl] " " url
			objVal[objID ":" lbl] = url
			objHas[objID] = objHas[objID] " " lbl

			# avoid duplication.
			objVal[objID ":space-key"] = ""
			objVal[objID ":dest-title"] = ""
		}
	}

	objID = ""
	next
}
$1$3 == "objid" {
	lbl = "obj-type"
	objID = $NF ; val = $2
	Labels[lbl]++
	objHas[objID] = lbl
	objVal[objID ":" lbl] = val
	objGot[objID] = 0
	next
}
$1$2 == "+created" || $1$2 == "+changed" {
	# shrink timestamp parts.
	lbl = $2 ; Labels[lbl]++
	objHas[objID] = objHas[objID] " " lbl

	if ( $3 $4 == "" ) next

	d = substr($3, 1, 4) substr($3, 6, 2) substr($3, 9)
	t = substr($4, 1, 2) substr($4, 4, 2) substr($4, 7)

	objVal[objID ":" lbl] = d "-" t
	next
}
$1 == "+" {
	line = $0
	lbl = $2 ; Labels[lbl]++
	# save remainder of line "as is" ...
	x = index(line, lbl) + length(lbl) + 1
	val = substr(line, x)

	# ... but ...

	# ... escape backslashes ...
	if ( index(val, "\\") > 0 ) {
		gsub("\\\\", "\\&bsol;", val)
	}

	# ... escape dbl-quotes ...
	if ( index(val, dq) > 0 ) {
		gsub(dq, "\\&quot;", val)
	}

	# ... escape commas to avoid CSV conflicts.
	if ( index(val, ",") > 0 ) {
		gsub(",", "\\&comma;", val)
	}

	objHas[objID] = objHas[objID] " " lbl

	ox = objID ":" lbl ; ov = objVal[ox];
	if ( length(ov) > 0 ) val = ov "\f" val
	objVal[ox] = val
}
END {
	# label the space trees.
	for ( sobj in space_o ) {
		if ( sobj == "" ) continue
		spcnam = "{" space_o[sobj] "}"
		spc_tag(sobj, spcnam)
	}

	# output the CSV.
	list = dq "obj-id" dq
	for ( l in Labels ) list = list "," dq l dq
		
	print list

	for ( o in objHas ) {
		line = dq o dq
		for ( l in Labels )
			line = line "," dq objVal[o ":" l] dq

#db#		print list
		print line
	}
}' "${@}"

	exit
	;;

cvt-xhtml)
	# convert a Confluence page to xhtml.
	#
	# the page (the named files, else stdin) is wrapped in an
	# <ac:confluence> root element that declares the namespaces,
	# run through the stylesheet with diagnostics discarded, and
	# the first line of the result is dropped.
	shift

	{
		cat <<__EOxml__
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE ac:confluence SYSTEM "confluence.dtd">
<ac:confluence xmlns:ac="http://www.atlassian.com/schema/confluence/4/ac/" xmlns:ri="http://www.atlassian.com/schema/confluence/4/ri/" xmlns="http://www.atlassian.com/schema/confluence/4/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.atlassian.com/schema/confluence/4/ac/ confluence.xsd">
__EOxml__

		if [ "$#" -eq 0 ] ; then
			cat -
		else
			cat "${@}"
		fi

		echo "</ac:confluence>"
	} |
		xsltproc --path "${libd}/_GH" \
			--nomkdir --nonet --nowrite \
			"${libd}/cnflu-to-xhtml.xsl" - 2> /dev/null |
		sed 1d

	exit
	;;

*)
	# unknown (or missing) sub-command: show the usage text.
	# ${self} is quoted so that a script path containing blanks is
	# re-executed as a single word.

	exec "${self}" usage
	# not reached.
	;;
esac
