Ingesting Dynamically Generated Content from Confluence
Scrape content dynamically generated by Confluence Macros
Last updated
Was this helpful?
Scrape content dynamically generated by Confluence Macros
Confluence offers Macros or add-ons to enhance functionality and display additional content. This content is not ingested by the standard Confluence connector. To ingest content generated by Macros, you will need to use a Custom Script with a custom field mapping. The process is outlined below.
The following script is used to ingest all content from the XML Schema of your Confluence pages.
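On line 9 of the original script (the line that defines BASE_ATTACHMENT_URL), you will need to replace <INSTANCE_ID> with your Confluence instance ID. This can be found by navigating to your Confluence and taking note of the subdomain; for example, if your Confluence URL is https://mycompany.atlassian.net, the instance ID is mycompany.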
In the Aisera Admin UI navigate to Settings > Data Source.
Select the Confluence Data Source for which you want to enable ingestion of dynamically generated content.
On the Data Source Details page, click the Pencil icon in the top right of the screen to edit the data source.
Select Configurations.
Modify the script above to replace the <INSTANCE_ID> placeholder (in the line that defines BASE_ATTACHMENT_URL, line 9 of the original script) with your Confluence instance ID.
Insert the modified script into the Custom Script field.

Add content.body.storage to the Additional Fields to Expand field.
Click OK to save the new configurations.
At the bottom of the Data Source Details page, click the New Field Mapping button.
Select Body for the Field, and enter body.storage.value for the Confluence Field.
Click OK to save the new field mapping.

Last updated
Was this helpful?
Was this helpful?
/**
 * Rewrites a Confluence page's storage-format body (`body.storage.value`)
 * so that Confluence-specific markup becomes ordinary HTML:
 *
 *   - macros with a plain-text body  -> <pre><code>…</code></pre>
 *     (the "hmpcost-html-macro-pro" macro is parsed as HTML instead)
 *   - other macros                   -> their rich-text body, or removed
 *   - <ac:link>                      -> <a>
 *   - <ac:image>                     -> <img>
 *   - <ac:emoticon>                  -> <span> holding the emoji text
 *
 * The script relies on `Java.type`, i.e. it runs inside a JVM-hosted
 * JavaScript engine with Jsoup on the classpath, and it deliberately sticks
 * to the ES5 syntax (`var`, function expressions) the script was written in.
 *
 * @param {Object} jsobject Page object; `id` and `body.storage.value` are read.
 * @returns {Object} The same object, with `body.storage.value` replaced by the
 *     converted HTML. If `body.storage.value` is missing the object is
 *     returned untouched.
 */
var transform = function (jsobject) {
  // Pages that arrive without the storage body have nothing to convert;
  // return them as-is instead of failing on a missing property.
  var bodyField = jsobject == null ? null : jsobject["body"];
  var storageField = bodyField == null ? null : bodyField["storage"];
  if (storageField == null || storageField["value"] == null) {
    return jsobject;
  }

  var Jsoup = Java.type('org.jsoup.Jsoup');

  // Get ID and storage body
  var id = jsobject["id"];
  var storageXml = storageField["value"];

  // Dynamic attachment base URL.
  // Replace <INSTANCE_ID> with the subdomain of your Confluence site.
  var BASE_ATTACHMENT_URL = "https://<INSTANCE_ID>.atlassian.net/wiki/download/attachments/" + id + "/";

  // Parse with default HTML parser
  var doc = Jsoup.parse(storageXml);

  // True when a value read from the document is null or the empty string.
  function isBlank(value) {
    return value == null || String(value).length === 0;
  }

  // Percent-encodes a single URL path segment or query-string value.
  function encodeUrlPart(value) {
    var raw = String(value);
    try {
      return encodeURIComponent(raw);
    } catch (err) {
      // encodeURIComponent throws URIError on malformed UTF-16 (lone
      // surrogates); fall back to encoding spaces only rather than aborting
      // the whole page.
      return raw.replace(/ /g, "%20");
    }
  }

  // Builds <pre><code>text</code></pre>; the text is escaped, not parsed.
  function buildCodeBlock(text) {
    var pre = doc.createElement("pre");
    var code = doc.createElement("code");
    code.text(text);
    pre.appendChild(code);
    return pre;
  }

  // True when the node has an <ac:structured-macro> ancestor.
  function isInsideMacro(node) {
    var ancestor = node.parent();
    while (ancestor != null) {
      if (ancestor.tagName() === "ac:structured-macro") {
        return true;
      }
      ancestor = ancestor.parent();
    }
    return false;
  }

  // ---------------------------------------------------------------------
  // 1) HANDLE ALL ac:structured-macro WITH ac:plain-text-body (CDATA)
  //    - hmpcost-html-macro-pro -> treat CDATA as HTML
  //    - everything else (code, code-block, ...) -> <pre><code>...</code></pre>
  // Lists are walked back to front throughout so that replacing an element
  // never disturbs the entries that are still to be visited.
  // ---------------------------------------------------------------------
  var macros = doc.getElementsByTag("ac:structured-macro");
  for (var i = macros.size() - 1; i >= 0; i--) {
    var macro = macros.get(i);
    var plainBody = macro.getElementsByTag("ac:plain-text-body").first();
    if (plainBody == null) {
      // macro with no plain-text-body; handled in step 3
      continue;
    }

    var cdata = plainBody.wholeText();
    if (cdata == null) {
      cdata = "";
    }

    if (macro.attr("ac:name") === "hmpcost-html-macro-pro") {
      // HTML macro: interpret CDATA as HTML (iframe, headings, etc.)
      // NOTE(review): the macro body is inserted without sanitising; confirm
      // that downstream consumers treat this content as untrusted.
      var wrapper = doc.createElement("div");
      wrapper.html(cdata);
      macro.replaceWith(wrapper);
    } else {
      macro.replaceWith(buildCodeBlock(cdata));
    }
  }

  // ---------------------------------------------------------------------
  // 2) ANY ac:plain-text-body OUTSIDE MACROS → <pre><code>...</code></pre>
  // ---------------------------------------------------------------------
  var plainBodies = doc.getElementsByTag("ac:plain-text-body");
  for (var j = plainBodies.size() - 1; j >= 0; j--) {
    var looseBody = plainBodies.get(j);
    if (isInsideMacro(looseBody)) {
      continue;
    }
    var rawText = looseBody.wholeText();
    if (rawText == null || String(rawText).trim().length === 0) {
      // whitespace-only bodies are left for the unwrap in step 7
      continue;
    }
    looseBody.replaceWith(buildCodeBlock(rawText));
  }

  // ---------------------------------------------------------------------
  // 3) OTHER MACROS (no plain-text-body): unwrap rich-text body, remove others
  // ---------------------------------------------------------------------
  macros = doc.getElementsByTag("ac:structured-macro");
  for (var k = macros.size() - 1; k >= 0; k--) {
    var leftover = macros.get(k);
    var richBody = leftover.getElementsByTag("ac:rich-text-body").first();
    if (richBody != null) {
      leftover.replaceWith(richBody);
    } else {
      leftover.remove();
    }
  }

  // ---------------------------------------------------------------------
  // 4) LINKS: <ac:link> → <a>
  // Only the link's text content is carried over; any markup nested inside
  // the link body is dropped. When several reference kinds are present the
  // later one wins (page, then attachment, then url).
  // ---------------------------------------------------------------------
  var acLinks = doc.getElementsByTag("ac:link");
  for (var l = acLinks.size() - 1; l >= 0; l--) {
    var acLink = acLinks.get(l);
    var href = null;
    var text = acLink.text();

    // <ac:link><ri:page ri:content-title="My page"/></ac:link>
    var pageRef = acLink.getElementsByTag("ri:page").first();
    if (pageRef != null) {
      var title = pageRef.attr("ri:content-title");
      if (!isBlank(title)) {
        href = "/wiki/pages/view?title=" + encodeUrlPart(title);
      }
    }

    // <ac:link><ri:attachment ri:filename="file name.png"/></ac:link>
    var attachmentRef = acLink.getElementsByTag("ri:attachment").first();
    if (attachmentRef != null) {
      var filename = attachmentRef.attr("ri:filename");
      if (!isBlank(filename)) {
        href = BASE_ATTACHMENT_URL + encodeUrlPart(filename);
      }
    }

    // <ac:link><ri:url ri:value="https://..."/></ac:link>
    var urlRef = acLink.getElementsByTag("ri:url").first();
    if (urlRef != null) {
      var value = urlRef.attr("ri:value");
      if (!isBlank(value)) {
        href = String(value);
      }
    }

    var a = doc.createElement("a");
    if (!isBlank(href)) {
      a.attr("href", href);
    }
    if (!isBlank(text)) {
      a.text(text);
    } else if (href != null) {
      // no visible label: show the target itself
      a.text(href);
    }
    acLink.replaceWith(a);
  }

  // ---------------------------------------------------------------------
  // 5) IMAGES: <ac:image> → <img>
  //    - <ri:attachment ri:filename="..."/> → attachment download URL
  //    - <ri:url ri:value="..."/>           → external URL, used as-is
  // ---------------------------------------------------------------------
  var acImages = doc.getElementsByTag("ac:image");
  for (var m = acImages.size() - 1; m >= 0; m--) {
    var acImg = acImages.get(m);

    var width = acImg.attr("ac:width");
    var height = null;
    // If user resized -> only width matters
    if (isBlank(width)) {
      width = acImg.attr("ac:original-width");
      height = acImg.attr("ac:original-height");
    }

    var src = null;
    var attachment = acImg.getElementsByTag("ri:attachment").first();
    if (attachment != null) {
      var fileNameRaw = attachment.attr("ri:filename");
      if (!isBlank(fileNameRaw)) {
        src = BASE_ATTACHMENT_URL + encodeUrlPart(fileNameRaw);
      }
    }
    if (src == null) {
      var externalRef = acImg.getElementsByTag("ri:url").first();
      if (externalRef != null) {
        var externalUrl = externalRef.attr("ri:value");
        if (!isBlank(externalUrl)) {
          src = String(externalUrl);
        }
      }
    }
    if (src == null) {
      // nothing to point the <img> at
      acImg.remove();
      continue;
    }

    var img = doc.createElement("img");
    img.attr("src", src);

    var altText = acImg.attr("ac:alt");
    if (!isBlank(altText)) {
      img.attr("alt", String(altText));
    }

    // alignment: ac:align="center|left|right"
    var align = acImg.attr("ac:align");
    if (!isBlank(align)) {
      var low = String(align).toLowerCase();
      if (low === "center") {
        img.attr("style", "display:block;margin-left:auto;margin-right:auto;");
      } else if (low === "right") {
        img.attr("style", "float:right;");
      } else if (low === "left") {
        img.attr("style", "float:left;");
      }
    }

    // strip stray quotes/backslashes from the size values
    if (!isBlank(width)) {
      img.attr("width", String(width).replace(/["\\]/g, ""));
    }
    if (!isBlank(height)) {
      img.attr("height", String(height).replace(/["\\]/g, ""));
    }

    acImg.replaceWith(img);
  }

  // ---------------------------------------------------------------------
  // 6) EMOTICONS: <ac:emoticon ... ac:emoji-fallback="😛" /> → <span>😛</span>
  // Preference order: emoji-fallback, emoji-shortname, then :name:.
  // ---------------------------------------------------------------------
  var emoticons = doc.getElementsByTag("ac:emoticon");
  for (var e = emoticons.size() - 1; e >= 0; e--) {
    var emo = emoticons.get(e);
    var emoji = emo.attr("ac:emoji-fallback");
    if (isBlank(emoji)) {
      emoji = emo.attr("ac:emoji-shortname"); // e.g. :stuck_out_tongue:
    }
    if (isBlank(emoji)) {
      var emoName = emo.attr("ac:name");
      emoji = isBlank(emoName) ? "" : ":" + emoName + ":"; // e.g. :cheeky:
    }
    if (isBlank(emoji)) {
      // nothing usable to display; drop it rather than emit a bare "::"
      emo.remove();
      continue;
    }
    // The element is renamed in place, so its original attributes remain.
    emo.tagName("span");
    emo.text(emoji);
  }

  // ---------------------------------------------------------------------
  // 7) CLEANUP: unwrap rich-text, remove params, unwrap leftover plain-text
  // ---------------------------------------------------------------------
  var bodies = doc.getElementsByTag("ac:rich-text-body");
  for (var n = bodies.size() - 1; n >= 0; n--) {
    bodies.get(n).unwrap();
  }

  var params = doc.getElementsByTag("ac:parameter");
  for (var p = params.size() - 1; p >= 0; p--) {
    params.get(p).remove();
  }

  plainBodies = doc.getElementsByTag("ac:plain-text-body");
  for (var q = plainBodies.size() - 1; q >= 0; q--) {
    plainBodies.get(q).unwrap();
  }

  // Write back
  storageField["value"] = doc.outerHtml();
  return jsobject;
};
