Adding labels to Confluence pages is a simple yet often overlooked task. While labels can play a vital role in organizing content, creating groups, and improving searchability, many people either forget or don’t prioritize it. However, automating the labeling process can significantly improve efficiency and content management, particularly when managing a large number of pages.
In Confluence, labels are an excellent tool for categorizing similar content and highlighting key topics. If you’re dealing with hundreds of pages, remembering to add relevant labels manually can be quite challenging. Fortunately, leveraging AI tools such as OpenAI, together with a bit of automation, can simplify this task by analyzing page content and suggesting appropriate labels automatically.
The following Python script automates the entire process. It fetches Confluence page content, analyzes it using OpenAI, and applies relevant labels based on a predefined list. This can save you a lot of time, especially when handling large Confluence spaces.
Below is the script that performs this task. It integrates Confluence’s API with OpenAI, automatically extracting page content and applying the labels:
import os
import requests
import logging
from requests.auth import HTTPBasicAuth
from openai import OpenAI
from bs4 import BeautifulSoup
# Configuration settings
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

CONFLUENCE_BASE_URL = "https://domain.atlassian.net/wiki/rest/api"
# Credentials are read from the environment so secrets are not committed to
# source control; the fallback values preserve the original script's defaults.
CONFLUENCE_USERNAME = os.environ.get("CONFLUENCE_USERNAME", "rodolfobortolin@gmail.com")
CONFLUENCE_API_TOKEN = os.environ.get("CONFLUENCE_API_TOKEN", "")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "sk-")

# Get current script location for file operations
script_location = os.path.dirname(os.path.abspath(__file__))
def get_confluence_space_pages(space_key):
    """Retrieve all pages from a specific Confluence space.

    Follows the REST API's pagination (the server caps results per request,
    25 by default), so spaces with many pages are retrieved in full rather
    than silently truncated to the first page of results.

    :param space_key: Key of the Confluence space to read.
    :return: List of page objects expanded with ``body.storage`` and
             ``version``, or an empty list if any request fails.
    """
    pages = []
    start = 0
    limit = 50
    while True:
        response = requests.get(
            f"{CONFLUENCE_BASE_URL}/content",
            # Let requests build/encode the query string instead of hand-crafting it.
            params={
                "spaceKey": space_key,
                "expand": "body.storage,version",
                "start": start,
                "limit": limit,
            },
            auth=HTTPBasicAuth(CONFLUENCE_USERNAME, CONFLUENCE_API_TOKEN),
            headers={"Accept": "application/json"},
        )
        if response.status_code != 200:
            logging.error(f"Failed to retrieve pages for space key {space_key}, Status Code: {response.status_code}")
            return []
        results = response.json().get('results', [])
        pages.extend(results)
        # A short page means we have reached the end of the result set.
        if len(results) < limit:
            break
        start += limit
    logging.info(f"Successfully retrieved pages for space key {space_key}")
    return pages
def extract_text_from_html(html_content):
    """Return the plain text of a Confluence page's storage-format HTML body."""
    parsed = BeautifulSoup(html_content, 'html.parser')
    plain_text = parsed.get_text(separator=' ', strip=True)
    return plain_text
def generate_openai_prompt(page_title, page_text, labels_list):
    """Build the classification prompt sent to the OpenAI API.

    :param page_title: Title of the Confluence page.
    :param page_text: Plain-text body of the page.
    :param labels_list: Allowed labels the model may choose from.
    :return: A single prompt string instructing the model to answer with
             comma-separated labels only.
    """
    allowed = ', '.join(labels_list)
    pieces = [
        "Analyze the following Confluence page content and determine the main topics or themes present. ",
        f"Only consider the following labels: {allowed}. ",
        "Please respond only with comma-separated labels, and nothing else.\n",
        f"Title: {page_title}\n\n",
        f"Content:\n{page_text}",
    ]
    return "".join(pieces)
def get_openai_labels(page_title, page_text, labels_list):
    """Send content to OpenAI and retrieve suggested labels based on a provided list.

    The labelling instructions (and the allowed label list) travel in a
    dedicated user message; the final message carries only the page title and
    content. The original code also built a combined prompt with
    generate_openai_prompt() and then immediately overwrote it — that dead
    assignment has been removed.

    :param page_title: Title of the Confluence page.
    :param page_text: Plain-text body of the page.
    :param labels_list: Allowed labels the model may choose from.
    :return: List of label strings (whitespace-stripped), or an empty list
             if the API call fails.
    """
    client = OpenAI(api_key=OPENAI_API_KEY)
    prompt = (f"Here is the Confluence page content: \n\n"
              f"Title: {page_title}\n\n"
              f"Content: \n{page_text}")
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an assistant that will classify content from Confluence pages."},
                {"role": "user", "content": f"Analyze the content of a Confluence page and assign labels. Only use the given list of labels, and respond with a comma-separated list of labels, and nothing else. Only consider the following labels: {', '.join(labels_list)}. "},
                {"role": "assistant", "content": "Ok"},
                {"role": "user", "content": prompt}
            ],
            # Deterministic, short answer: we only want the label list back.
            temperature=0,
            max_tokens=150,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        labels = response.choices[0].message.content.strip()
        logging.info(f"OpenAI suggested labels: {labels}")
        # "a, b" splits into [" b"] fragments — strip each label here so
        # callers receive clean names.
        return [label.strip() for label in labels.split(",")]
    except Exception as e:
        logging.error(f"Error communicating with OpenAI API: {e}")
        return []
def add_confluence_page_labels(page_id, labels):
    """Add labels to a Confluence page."""
    endpoint = f"{CONFLUENCE_BASE_URL}/content/{page_id}/label"
    # Confluence expects a JSON array of {prefix, name} objects.
    payload = []
    for label in labels:
        payload.append({"prefix": "global", "name": label.strip()})
    response = requests.post(
        endpoint,
        auth=HTTPBasicAuth(CONFLUENCE_USERNAME, CONFLUENCE_API_TOKEN),
        headers={"Content-Type": "application/json"},
        json=payload,
    )
    if response.status_code != 200:
        logging.error(f"Failed to add labels to page ID {page_id}, Status Code: {response.status_code}")
    else:
        logging.info(f"Successfully added labels {labels} to page ID {page_id}")
def process_confluence_space(space_key, labels_list):
    """Process all pages within a Confluence space by retrieving content, analyzing it with OpenAI, and adding labels.

    Suggested labels are matched against ``labels_list`` case-insensitively
    (the model may change casing, e.g. "rest_api" vs "REST_API"), and the
    canonical spelling from ``labels_list`` is the one applied to the page.

    :param space_key: Key of the Confluence space to process.
    :param labels_list: Whitelist of labels that may be applied.
    """
    pages = get_confluence_space_pages(space_key)
    if not pages:
        logging.warning(f"No pages found for space key {space_key}")
        return
    logging.info(f"Found {len(pages)} pages in space {space_key}")
    # lower-cased label -> canonical spelling, so matching is case-insensitive
    # but the applied label keeps the casing declared in labels_list.
    canonical = {label.lower(): label for label in labels_list}
    for page in pages:
        page_id = page['id']
        page_title = page['title']
        logging.info(f"Processing page ID {page_id} with title: {page_title}")
        # Guard against pages returned without an expanded body.
        page_content = page.get('body', {}).get('storage', {}).get('value', '')
        page_text = extract_text_from_html(page_content)
        # Analyze content with OpenAI
        openai_labels = get_openai_labels(page_title, page_text, labels_list)
        # Keep only suggestions present in the allowed list.
        relevant_labels = [canonical[l.strip().lower()]
                           for l in openai_labels
                           if l.strip().lower() in canonical]
        if relevant_labels:
            add_confluence_page_labels(page_id, relevant_labels)
        else:
            logging.info(f"No relevant labels found for page ID {page_id}")
# Example usage
if __name__ == "__main__":
    # Ask the operator which space to label.
    target_space = input("Enter the Confluence space key to analyze: ")
    # Whitelist of labels the classifier is allowed to apply.
    allowed_labels = [
        "documentation", "tutorial", "policy", "meeting notes",
        "architecture", "release notes", "java", "python", "REST_API",
    ]
    # Process the Confluence space
    process_confluence_space(target_space, allowed_labels)
How It Works
- Fetching Pages: The script connects to the Confluence API and retrieves all pages within a specified space. This makes it easy to process an entire section of your documentation at once.
- Extracting Page Content: Using BeautifulSoup, the script extracts plain text from the HTML content of each Confluence page. This cleaned-up content is then ready for analysis.
- Generating Labels with OpenAI: The OpenAI API is used to analyze the text of each page and suggest relevant labels based on a predefined list. This ensures that the labels applied are both meaningful and consistent with your existing content strategy.
- Applying Labels: Once the relevant labels are determined, the script uses the Confluence API to add these labels to each page automatically.

This approach allows for quick content classification without having to manually review each page. Labels not only make content more searchable but also provide a powerful way to group similar topics, making your Confluence space more organized and easier to navigate.