Ingesting Dynamically Generated Content from Confluence

Scrape content dynamically generated by Confluence Macros

Dynamically Generated Content

Confluence offers Macros and add-ons that enhance functionality and display additional content. This content is not ingested by the standard Confluence connector. To ingest content generated by Macros, you need to use a Custom Script together with a custom field mapping. The process is outlined below.

Script

Use the following script to ingest all content from the XML Schema of your Confluence pages.

On line 9 of the script, replace <INSTANCE_ID> with your Confluence instance ID. You can find it by opening your Confluence site and noting the subdomain in the URL (the part before .atlassian.net).

var transform = function (jsobject) {
    // Converts the Confluence storage-format XML held in
    // jsobject.body.storage.value into plain HTML (macros, links, images and
    // emoticons) and writes the result back to the same field.
    // Returns jsobject; a record that has no storage body is returned
    // unchanged instead of raising an error.
    //
    // Replace <INSTANCE_ID> on the next line with your Confluence subdomain.
    var ATTACHMENT_ROOT_URL = "https://<INSTANCE_ID>.atlassian.net/wiki/download/attachments/";

    // Guard: records without an expanded storage body are passed through.
    var pageBody   = jsobject == null ? null : jsobject["body"];
    var storage    = pageBody == null ? null : pageBody["storage"];
    var storageXml = storage == null ? null : storage["value"];
    if (storageXml == null) {
        return jsobject;
    }

    var Jsoup = Java.type('org.jsoup.Jsoup');

    // Attachments are addressed per page: <root>/<page id>/<file name>
    var attachmentBaseUrl = ATTACHMENT_ROOT_URL + jsobject["id"] + "/";

    // Parse with default HTML parser
    var doc = Jsoup.parse(storageXml);

    // True when value is neither null/undefined nor the empty string.
    // Uses a length check so it does not rely on Java String methods.
    function hasText(value) {
        return value != null && String(value).length > 0;
    }

    // Percent-encodes a title or file name for use inside a URL. Encoding
    // only spaces is not enough: characters such as & # ? + % would change
    // the meaning of the URL.
    function encodeUrlPart(value) {
        return encodeURIComponent(String(value));
    }

    // Builds <pre><code>text</code></pre>; the text is escaped, not parsed.
    function newCodeBlock(text) {
        var pre  = doc.createElement("pre");
        var code = doc.createElement("code");
        code.text(text);
        pre.appendChild(code);
        return pre;
    }

    // True when element has an ac:structured-macro ancestor.
    function isInsideMacro(element) {
        var ancestor = element.parent();
        while (ancestor != null) {
            if (String(ancestor.tagName()) === "ac:structured-macro") {
                return true;
            }
            ancestor = ancestor.parent();
        }
        return false;
    }

    // ---------------------------------------------------------------------
    // 1) HANDLE ALL ac:structured-macro WITH ac:plain-text-body (CDATA)
    //    - hmpcost-html-macro-pro -> treat CDATA as HTML
    //    - everything else (code, code-block, ...) -> <pre><code>...</code></pre>
    //    Iterating backwards processes nested macros before their parents.
    // ---------------------------------------------------------------------
    var macros = doc.getElementsByTag("ac:structured-macro");
    for (var i = macros.size() - 1; i >= 0; i--) {
        var macro = macros.get(i);

        var plainBody = macro.getElementsByTag("ac:plain-text-body").first();
        if (plainBody == null) {
            // rich-text-body only (or empty) macro – handled in step 3
            continue;
        }

        var cdata = plainBody.wholeText();
        if (cdata == null) {
            cdata = "";
        }

        if (String(macro.attr("ac:name")) === "hmpcost-html-macro-pro") {
            // NOTE: the CDATA is inserted as markup, so the result is only
            // as trustworthy as the authors of the Confluence page.
            var wrapper = doc.createElement("div");
            wrapper.html(cdata);
            macro.replaceWith(wrapper);
        } else {
            macro.replaceWith(newCodeBlock(cdata));
        }
    }

    // ---------------------------------------------------------------------
    // 2) ANY ac:plain-text-body OUTSIDE MACROS → <pre><code>...</code></pre>
    // ---------------------------------------------------------------------
    var plainBodies = doc.getElementsByTag("ac:plain-text-body");
    for (var j = plainBodies.size() - 1; j >= 0; j--) {
        var looseBody = plainBodies.get(j);
        if (isInsideMacro(looseBody)) {
            continue;
        }

        var rawText = looseBody.wholeText();
        if (rawText == null || String(rawText).trim().length === 0) {
            continue;
        }

        looseBody.replaceWith(newCodeBlock(rawText));
    }

    // ---------------------------------------------------------------------
    // 3) OTHER MACROS (no plain-text-body): keep rich-text body, drop the rest
    // ---------------------------------------------------------------------
    macros = doc.getElementsByTag("ac:structured-macro");
    for (var k = macros.size() - 1; k >= 0; k--) {
        var remainingMacro = macros.get(k);
        var richBody = remainingMacro.getElementsByTag("ac:rich-text-body").first();
        if (richBody != null) {
            remainingMacro.replaceWith(richBody);
        } else {
            remainingMacro.remove();
        }
    }

    // ---------------------------------------------------------------------
    // 4) LINKS: <ac:link> → <a>
    //    When several targets are present the later one wins
    //    (page < attachment < url).
    // ---------------------------------------------------------------------
    var acLinks = doc.getElementsByTag("ac:link");
    for (var l = acLinks.size() - 1; l >= 0; l--) {
        var acLink = acLinks.get(l);

        var href = null;
        var fallbackLabel = null;   // used when the link has no body text

        // <ac:link><ri:page ri:content-title="My page"/></ac:link>
        var pageRef = acLink.getElementsByTag("ri:page").first();
        if (pageRef != null && hasText(pageRef.attr("ri:content-title"))) {
            fallbackLabel = String(pageRef.attr("ri:content-title"));
            href = "/wiki/pages/view?title=" + encodeUrlPart(fallbackLabel);
        }

        // <ac:link><ri:attachment ri:filename="file name.png"/></ac:link>
        var attachmentRef = acLink.getElementsByTag("ri:attachment").first();
        if (attachmentRef != null && hasText(attachmentRef.attr("ri:filename"))) {
            fallbackLabel = String(attachmentRef.attr("ri:filename"));
            href = attachmentBaseUrl + encodeUrlPart(fallbackLabel);
        }

        // <ac:link><ri:url ri:value="https://..."/></ac:link>
        var urlRef = acLink.getElementsByTag("ri:url").first();
        if (urlRef != null && hasText(urlRef.attr("ri:value"))) {
            href = String(urlRef.attr("ri:value"));
            fallbackLabel = href;
        }

        var label = acLink.text();
        if (!hasText(label)) {
            label = fallbackLabel;
        }

        var anchor = doc.createElement("a");
        if (hasText(href)) {
            anchor.attr("href", href);
        }
        if (hasText(label)) {
            anchor.text(label);
        }

        acLink.replaceWith(anchor);
    }

    // ---------------------------------------------------------------------
    // 5) IMAGES: <ac:image> → <img>
    //    Supports attached images (ri:attachment) and external ones (ri:url).
    // ---------------------------------------------------------------------
    var acImages = doc.getElementsByTag("ac:image");
    for (var m = acImages.size() - 1; m >= 0; m--) {
        var acImg = acImages.get(m);

        var src = null;
        var attachment = acImg.getElementsByTag("ri:attachment").first();
        if (attachment != null && hasText(attachment.attr("ri:filename"))) {
            src = attachmentBaseUrl + encodeUrlPart(attachment.attr("ri:filename"));
        } else {
            var externalRef = acImg.getElementsByTag("ri:url").first();
            if (externalRef != null && hasText(externalRef.attr("ri:value"))) {
                src = String(externalRef.attr("ri:value"));
            }
        }
        if (src == null) {
            // nothing to point the <img> at
            acImg.remove();
            continue;
        }

        // If the user resized the image only the width is carried over;
        // otherwise fall back to the original dimensions.
        var width  = acImg.attr("ac:width");
        var height = null;
        if (!hasText(width)) {
            width  = acImg.attr("ac:original-width");
            height = acImg.attr("ac:original-height");
        }

        var img = doc.createElement("img");
        img.attr("src", src);

        // alignment: ac:align="center|left|right"
        var align = acImg.attr("ac:align");
        if (hasText(align)) {
            var low = String(align).toLowerCase();
            if (low === "center") {
                img.attr("style", "display:block;margin-left:auto;margin-right:auto;");
            } else if (low === "right") {
                img.attr("style", "float:right;");
            } else if (low === "left") {
                img.attr("style", "float:left;");
            }
        }

        if (hasText(width)) {
            img.attr("width", String(width).replace(/["\\]/g, ""));
        }
        if (hasText(height)) {
            img.attr("height", String(height).replace(/["\\]/g, ""));
        }

        acImg.replaceWith(img);
    }

    // ---------------------------------------------------------------------
    // 6) EMOTICONS: <ac:emoticon ... ac:emoji-fallback="😛" /> → <span>😛</span>
    //    Preference: emoji-fallback, then emoji-shortname, then :name:
    // ---------------------------------------------------------------------
    var emoticons = doc.getElementsByTag("ac:emoticon");
    for (var e = emoticons.size() - 1; e >= 0; e--) {
        var emo = emoticons.get(e);

        var emoji = emo.attr("ac:emoji-fallback");
        if (!hasText(emoji)) {
            emoji = emo.attr("ac:emoji-shortname"); // e.g. :stuck_out_tongue:
        }
        if (!hasText(emoji)) {
            var emoName = emo.attr("ac:name");      // e.g. cheeky -> :cheeky:
            emoji = hasText(emoName) ? ":" + emoName + ":" : "";
        }

        // The element is renamed in place; its original attributes are kept.
        emo.tagName("span");
        emo.text(emoji);
    }

    // ---------------------------------------------------------------------
    // 7) CLEANUP: unwrap rich-text, remove params, unwrap leftover plain-text
    // ---------------------------------------------------------------------
    var bodies = doc.getElementsByTag("ac:rich-text-body");
    for (var n = bodies.size() - 1; n >= 0; n--) {
        bodies.get(n).unwrap();
    }

    var params = doc.getElementsByTag("ac:parameter");
    for (var p = params.size() - 1; p >= 0; p--) {
        params.get(p).remove();
    }

    plainBodies = doc.getElementsByTag("ac:plain-text-body");
    for (var q = plainBodies.size() - 1; q >= 0; q--) {
        plainBodies.get(q).unwrap();
    }

    // Write back
    storage["value"] = doc.outerHtml();
    return jsobject;
};

Configure the Data Source

In the Aisera Admin UI navigate to Settings > Data Source.
Select the Confluence Data Source you want to activate to enable ingestion of dynamically generated content.
On the Data Source Details page, click the Pencil icon in the top right of the screen to edit the data source.
Select Configurations.
Modify the script above to replace the <INSTANCE_ID> placeholder in line 9 with your Confluence instance ID.
Insert the modified script into the Custom Script field.
Add content.body.storage to the Additional Fields to Expand field.
Click OK to save the new configurations.
At the bottom of the Data Source Details page, click the New Field Mapping button.
Select Body for the Field, and enter body.storage.value for the Confluence Field.
Click OK to save the new field mapping.

Previous: Confluence Connector | Next: Setting Up Cribl with Aisera

Last updated 2 months ago

Was this helpful?

hashtagDynamically Generated Content

hashtagScript

hashtagConfigure the Data Source

Dynamically Generated Content

Script

Configure the Data Source