# mod_notes_links.py

"""
mod_notes_links.py v1.9
Julián

Manage Markdown files (works on the website's index.html pages):
- Scan every index.html file of the website
- Search for the attached-file pattern
    - Make the path relative
    - Replace the alt text with the file name
    - Replace the image tag with a classic link tag for non-image files
- Search for the external-link pattern
    - Update the link so that it opens in a new tab
"""

import os
import re
from datetime import datetime

# Custom variables
WEBSITE_PAGES = "/var/www/<website_name>/public/posts"  # tree walked for index.html files
DOMAIN = ('<ip_address>', '<domain_name>')  # hosts whose links are left untouched
DEBUG = False      # master switch for the traces below
DEBUG_FILE = True  # with DEBUG: trace attached-file rewrites
DEBUG_LINK = True  # with DEBUG: trace link rewrites
TEST = False       # dry run: when True nothing is written back to disk

# Variables
PATTERN_FILE = re.compile(r'<p><img\s+(?P<alt>alt=".*?"\s+)?src=".*\/(?P<filename>.*?)(?P<extension>\..*?)">')
PATTERN_LINK = re.compile(r'<a href="https?:\/\/(?P<domain>.*?)"+?\s*(?P<target>target=".*")?>.*?<\/a>')
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp')
# Opening of an attachment tag, with or without a leading alt attribute.
PATTERN_IMG_OPEN = re.compile(r'<p><img\s+(?:alt=".*?"\s+)?src="')
# One opening anchor tag: "head" ends right after the href value, "attrs" is
# everything that follows up to and including the closing ">".
PATTERN_ANCHOR_OPEN = re.compile(r'(?P<head><a\s+href="(?P<href>[^"]+)")(?P<attrs>[^>]*>)')
# Host part of an absolute http(s) URL (userinfo, port and path excluded).
PATTERN_HOST = re.compile(r'https?://(?:[^/?#@]*@)?(?P<host>[^/?#:]*)')
# A real target attribute (not e.g. data-target).
PATTERN_TARGET_ATTR = re.compile(r'(?<![\w-])target\s*=')


def stamp() -> str:
    """Return the current local time formatted as the ``[YYYY-MM-DD HH:MM:SS]`` log prefix."""
    return f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]"


def is_internal_host(host: str, domains=None) -> bool:
    """Tell whether *host* is one of *domains* or a subdomain of one of them.

    The comparison is case-insensitive. *domains* defaults to ``DOMAIN``.
    A plain suffix test is deliberately avoided: it would also accept
    unrelated hosts such as ``notexample.org`` for ``example.org``.
    """
    if domains is None:
        domains = DOMAIN
    host = host.lower()
    for domain in domains:
        domain = domain.lower().lstrip(".")
        if host == domain or host.endswith("." + domain):
            return True
    return False


def rewrite_file_line(line: str, match) -> str:
    """Return *line* with its attached-file tag rewritten.

    *match* is the ``PATTERN_FILE`` match found in *line*. The ``src`` path is
    prefixed with ``../``, the alt text (when present) becomes the file name,
    and a file whose extension is not in ``IMAGE_EXTENSIONS`` is turned from
    an ``<img>`` paragraph into an ``<a href>`` link labelled with the file
    name and extension.
    """
    filename = match.group("filename")
    extension = match.group("extension")
    alt = match.group("alt")

    # Set relative path
    # NOTE(review): the prefix is added unconditionally, so running this on an
    # already processed page yields "../../..."; presumably the pages are
    # regenerated before every run -- confirm.
    new_line = line.replace('src="', 'src="../', 1)

    # Change alt text by filename
    if alt:
        new_line = new_line.replace(alt, 'alt="' + filename + '" ', 1)

    # Replace image tag for classic file tag
    if not extension.lower().endswith(IMAGE_EXTENSIONS):
        # The opening tag is matched with a pattern rather than the literal
        # '<p><img src="' so that tags carrying an alt attribute are
        # converted too; otherwise only the closing part would be rewritten
        # and the result would be an <img> followed by a stray </a>.
        new_line = PATTERN_IMG_OPEN.sub('<a href="', new_line, count=1)
        new_line = new_line.replace('"></p>', '">' + filename + extension + '</a>', 1)

    return new_line


def rewrite_link_line(line: str, domains=None) -> str:
    """Return *line* with ``target="_blank"`` added to its external links.

    Every ``<a href="...">`` tag is judged on its own: it is changed only when
    its href is an absolute http(s) URL, its host is not internal (see
    ``is_internal_host``) and it has no ``target`` attribute yet. Relative
    links, internal links and links that already set a target are returned
    untouched.
    """
    def add_target(anchor):
        url = PATTERN_HOST.match(anchor.group("href"))
        if url is None:
            return anchor.group(0)  # relative or non-http(s) link
        if is_internal_host(url.group("host"), domains):
            return anchor.group(0)
        if PATTERN_TARGET_ATTR.search(anchor.group("attrs")):
            return anchor.group(0)
        return anchor.group("head") + ' target="_blank"' + anchor.group("attrs")

    return PATTERN_ANCHOR_OPEN.sub(add_target, line)


def process_file(filepath: str) -> bool:
    """Rewrite the attachment and link tags of one HTML file.

    Each line is tested against ``PATTERN_FILE`` first and, only when that
    fails, against ``PATTERN_LINK``; lines matching neither are kept as they
    are. The file is written back only when at least one line actually
    changed and ``TEST`` is off.

    Returns True when the file was rewritten on disk, False otherwise.
    """
    new_file = []
    modified = False

    with open(filepath, "r", encoding="utf-8") as file:
        for line in file:
            match_file = PATTERN_FILE.search(line)
            match_link = None if match_file else PATTERN_LINK.search(line)

            # Step 1: File match
            if match_file:
                new_line = rewrite_file_line(line, match_file)

                if DEBUG and DEBUG_FILE:
                    print(f"{stamp()} --- {filepath}")
                    print(f"{stamp()} ------ MATCH FILE")
                    print(f"{stamp()}  old_line:", line)
                    print(f"{stamp()}       alt:", match_file.group("alt"))
                    print(f"{stamp()}  filename:", match_file.group("filename"))
                    print(f"{stamp()} extension:", match_file.group("extension"))
                    print(f"{stamp()}  new_line:", new_line)

            # Step 2: Link match
            elif match_link:
                new_line = rewrite_link_line(line)

                if DEBUG and DEBUG_LINK:
                    print(f"{stamp()} --- {filepath}")
                    print(f"{stamp()} ------ MATCH LINK")
                    print(f"{stamp()}   old_line:", line)
                    print(f"{stamp()}     domain:", match_link.group("domain"))
                    print(f"{stamp()}     target:", match_link.group("target"))
                    print(f"{stamp()}   new_line:", new_line)

            else:
                new_line = line

            new_file.append(new_line)
            # A matching line is not necessarily a changed line (internal
            # link, target already set): only real changes justify a rewrite.
            if new_line != line:
                modified = True

    # Write back new file if something changed
    if modified and not TEST:
        with open(filepath, "w", encoding="utf-8") as file:
            file.writelines(new_file)
        return True
    return False


# Print header
print(f"{stamp()} >>> mod_notes_links.py")
print(stamp(), "DEBUG=True" if DEBUG else "DEBUG=False")
print(stamp(), "TEST=True" if TEST else "TEST=False")

# Scan every website index.html files
# (kept at module level, without a __main__ guard, so that importing the
# module still runs the scan exactly as before)
for dirpath, dirnames, filenames in os.walk(WEBSITE_PAGES):
    if "index.html" in filenames:
        process_file(os.path.join(dirpath, "index.html"))