In [8]:
import os
from lxml import etree
from IPython.display import display, HTML
import base64
import html
import re

# --- Constants and Namespace ---
NS = {
    'tei': 'http://www.tei-c.org/ns/1.0',
    'xml': 'http://www.w3.org/XML/1998/namespace'
}
XML_FILE = '../../Downloads/pharr2025v.xml'
XML_FILE = 'pharr2025.xml'

OUTPUT_FILE = "homeric_greek_three_pane.html"

# --- Embedded CSS for Three-Pane Layout ---
CSS_STYLE = """
<style>
    @import url('https://fonts.googleapis.com/css2?family=Cardo:ital,wght@0,400;0,700;1,400&family=Noto+Serif+Greek:wght@400;700&display=swap');

    :root {
        --sidebar-width: 280px;
        --primary-color: #0056b3;
        --background-color: #f4f4f0;
        --pane-background: #fdfdfa;
        --text-color: #333;
        --border-color: #ddd;
        --pane-padding: 30px;
        --highlight-bg: #FFF9C4;
        --highlight-text: #5D4037;
    }
    
    html {
        scroll-behavior: smooth;
    }

    body {
        font-family: 'Cardo', serif;
        line-height: 1.7;
        margin: 0; padding: 0;
        display: flex;
        overflow: hidden;
        background-color: var(--background-color);
        color: var(--text-color);
    }

    #sidebar {
        width: var(--sidebar-width);
        min-width: var(--sidebar-width);
        height: 100vh;
        position: fixed; left: 0; top: 0;
        background-color: #f0f0e8;
        border-right: 1px solid var(--border-color);
        overflow-y: auto;
        padding: 20px;
        box-sizing: border-box;
    }
    
    .sidebar-header {
        padding-bottom: 10px;
        margin-bottom: 15px;
        border-bottom: 1px solid var(--border-color);
        font-size: 0.85em;
        line-height: 1.5;
    }
    .sidebar-header h1 {
        font-size: 1.4em;
        margin: 0 0 5px 0;
        line-height: 1.2;
        text-align: left;
    }
    .sidebar-header p {
        margin: 2px 0;
        font-size: 1em;
        text-align: left;
    }
    .sidebar-header a {
        font-weight: bold;
    }

    #sidebar h2 { font-size: 1.3em; margin-top: 0; color: #000; }
    #sidebar ul { list-style-type: none; padding: 0; margin: 0; }
    #sidebar ul.toc-level-1 > li { margin-bottom: 8px; font-weight: bold; font-size: 1em;}
    #sidebar ul.toc-level-2 { padding-left: 15px; font-weight: normal; }
    #sidebar ul.toc-level-2 > li { margin-bottom: 4px; font-size: 0.9em; }
    #sidebar a { text-decoration: none; color: var(--primary-color); display: block; padding: 2px 0; }
    #sidebar a:hover { text-decoration: underline; color: #003d7c; }
    
    .content-pane {
        flex: 1;
        height: 100vh;
        overflow-y: auto;
        padding: 20px var(--pane-padding);
        box-sizing: border-box;
        background-color: var(--pane-background);
    }
    #main-content-center {
        margin-left: var(--sidebar-width);
        border-right: 1px solid var(--border-color);
    }
    
    .page-anchor { display: block; position: relative; top: -70px; visibility: hidden; }
    
    .page-milestone {
        font-weight: bold;
        color: #888;
        padding: 0 0.25em;
    }

    .line-number {
        flex: 0 0 40px;
        padding-right: 10px;
        text-align: right;
        color: #aaa;
        font-size: 0.8em;
        font-family: monospace;
    }
    .line-link {
        text-decoration: none;
    }
    .lg {
        display: block;
        margin: 1em 0;
        padding-left: 15px;
        border-left: 3px solid #ccc;
        font-style: italic;
    }
    .line {
        display: flex;
        align-items: baseline;
    }

    h1, h2, h3, h4 { color: #222; line-height: 1.2; margin-top: 1.8em; font-family: 'Cardo', serif; }
    h2 { font-size: 2em; border-bottom: 1px solid var(--border-color); padding-bottom: 5px; }
    h3 { font-size: 1.6em; }
    h4 { font-size: 1.3em; font-style: italic; }
    p { margin-bottom: 1em; text-align: justify; }
    p b:first-child {
        padding-right: 0.5em;
    }
    .greek { font-family: 'Noto Serif Greek', serif; }
    .center { text-align: center; }
    
    .lem {
        background-color: var(--highlight-bg);
        color: var(--highlight-text);
        padding: 1px 4px;
        border-radius: 3px;
        font-weight: bold;
    }

    figure { margin: 25px auto; padding: 15px; border: 1px solid #e0e0e0; background-color: #f9f9f9; text-align: center; max-width: 90%; }
    figcaption { font-style: italic; font-size: 0.9em; color: #555; margin-top: 8px; }
    table { width: 100%; border-collapse: collapse; margin: 25px 0; font-size: 0.95em; }
    caption { font-weight: bold; font-size: 1.1em; padding: 8px; margin-bottom: 10px; text-align: left; }
    th, td { border: 1px solid var(--border-color); padding: 8px; text-align: left; vertical-align: top; }
    th { background-color: #f2f2f2; font-weight: bold; }
    .footnote-section { font-size: 0.9em; border-top: 1px solid var(--border-color); margin-top: 2.5em; padding-top: 1em; }
    .footnote-ref { vertical-align: super; font-size: 0.75em; }
    blockquote { border-left: 3px solid #ccc; padding-left: 15px; margin-left: 0; font-style: italic;}
</style>
"""

# --- Embedded JavaScript for Interactive Scrolling ---
JS_SCRIPT = """
<script>
    document.addEventListener('DOMContentLoaded', function() {
        document.body.addEventListener('click', function(event) {
            let target = event.target.closest('a');

            if (target && target.getAttribute('href') && target.getAttribute('href').startsWith('#')) {
                event.preventDefault();
                
                const href = target.getAttribute('href');
                const baseId = href.substring(href.indexOf('#') + 1).replace(/^(center-|right-)/, '');

                const sourcePane = target.closest('#sidebar, #main-content-center, #main-content-right');
                
                if (!sourcePane || !baseId) return;

                const sourceId = sourcePane.id;

                if (sourceId === 'sidebar') {
                    // TOC click scrolls both panes
                    const centerTarget = document.getElementById('center-' + baseId);
                    const rightTarget = document.getElementById('right-' + baseId);
                    if (centerTarget) centerTarget.scrollIntoView({ behavior: 'smooth' });
                    if (rightTarget) rightTarget.scrollIntoView({ behavior: 'smooth' });
                } else if (sourceId === 'main-content-center') {
                    // Center click scrolls right pane
                    const rightTarget = document.getElementById('right-' + baseId);
                    if (rightTarget) rightTarget.scrollIntoView({ behavior: 'smooth' });
                } else if (sourceId === 'main-content-right') {
                    // Right click scrolls center pane
                    const centerTarget = document.getElementById('center-' + baseId);
                    if (centerTarget) centerTarget.scrollIntoView({ behavior: 'smooth' });
                }
            }
        });
    });
</script>
"""

class TeiToHtmlConverter:
    """Converts a TEI XML file into a standalone HTML file."""
    
    def __init__(self, xml_tree):
        self.tree = xml_tree
        self.footnotes = []
        self.footnote_counter = 0
        self._has_debugged_l_ancestors = False # ADD THIS LINE

    def process_children(self, element):
        """Processes element children, preserving and normalizing whitespace."""
        parts = []
        if element.text:
            parts.append(html.escape(element.text))
        
        for child in element:
            if isinstance(child, etree._Element):
                parts.append(self.process_element(child))
            if child.tail:
                parts.append(html.escape(child.tail))
        
        content = "".join(parts)
        content = re.sub(r'\s+', ' ', content)
        return content.strip()
        
    def process_element(self, element):
        """Dispatcher to handle different TEI elements."""
        tag = etree.QName(element.tag).localname
        handler = getattr(self, f'handle_{tag}', self.handle_default)
        return handler(element)

    def handle_default(self, element):
        return self.process_children(element)

    # --- Element Handlers ---
    def handle_p(self, element):
        content = self.process_children(element)
        parent = element.getparent()
        prefix = ''
        
        if etree.QName(parent.tag).localname == 'div':
            n_attr = parent.get('n')
            if n_attr:
                is_in_exercise = parent.xpath('ancestor-or-self::tei:div[@type="exercise"]', namespaces=NS)
                if parent.get('type') == 'sentence' or is_in_exercise:
                    prefix = f'<b>{n_attr}.</b> '

        extra_class = "center" if element.get('rend') == 'center' else ""
        return f'<p class="{extra_class}">{prefix}{content}</p>'


    def handle_head(self, element):
        parent_tag = etree.QName(element.getparent().tag).localname
        if parent_tag == 'table':
            return f"<caption>{self.process_children(element)}</caption>"
        if parent_tag == 'figure':
             return f"<h4>{self.process_children(element)}</h4>"
        
        level = 2
        parent_div_depth = len(element.xpath('ancestor::tei:div', namespaces=NS))
        if parent_div_depth > 1: level = 3
        if element.get('type') == 'sub': level = 4
        return f"<h{level}>{self.process_children(element)}</h{level}>"

    def handle_pb(self, element):
        """Handles page breaks by creating an anchor and a visible milestone."""
        pb_id = element.get(f'{{{NS["xml"]}}}id')
        n = element.get('n')
        anchor_id = pb_id if pb_id else f"page-{n}"
        
        anchor_html = f'<a class="page-anchor" id="{anchor_id}"></a>' if anchor_id else ''
        milestone_html = f' <span class="page-milestone">[pg. {n}]</span> ' if n else ''
        
        return f'{anchor_html}{milestone_html}'

    def handle_div(self, element):
        """Handles div elements, ensuring footnote state is properly scoped."""
        original_div_id = element.get(f'{{{NS["xml"]}}}id', '')
        output_section_id = original_div_id
        n_attr = element.get('n')
        subtype = element.get('subtype')
        prefix = ''

        id_match = re.match(r'^p(\d+)$', original_div_id)
        if id_match:
            num = id_match.group(1)
            prefix = f'<b>§{num}.</b> '
        elif subtype == 'commline' and n_attr:
            output_section_id = f"ln{n_attr}"
            link_target_num = n_attr.split('-')[0]
            prefix = f'<a href="#iliad1-line-{link_target_num}"><b>{n_attr}:</b></a> '

        # --- STATE MANAGEMENT FIX ---
        # 1. Save the current list of footnotes from any parent div.
        parent_footnotes = self.footnotes
        # 2. Create a new, empty list for the current div's scope.
        self.footnotes = []
        # --- END FIX ---

        content = self.process_children(element)
        final_content = prefix + content

        # This block now correctly renders only the footnotes found within this div.
        if self.footnotes:
            final_content += '<div class="footnote-section">'
            for note in self.footnotes:
                back_link = ''
                if 'backlink_id' in note and note['backlink_id']:
                    back_link = f' <a href="#{note["backlink_id"]}" title="Go back to text">&#8617;</a>'
                
                final_content += f'<p id="{note.get("id", "")}"><sup>{note.get("number", "*")}</sup> {note.get("text", "")}{back_link}</p>'
            final_content += '</div>'

        # --- STATE MANAGEMENT FIX ---
        # 3. Restore the parent's footnote list before returning.
        self.footnotes = parent_footnotes
        # --- END FIX ---
            
        return f'<section id="{output_section_id}">{final_content}</section>'
        
    def handle_div_bak1(self, element):
        """Handles div elements, including special formatting and numbering."""
        original_div_id = element.get(f'{{{NS["xml"]}}}id', '')
        output_section_id = original_div_id
        n_attr = element.get('n')
        subtype = element.get('subtype')
        prefix = ''

        id_match = re.match(r'^p(\d+)$', original_div_id)
        if id_match:
            num = id_match.group(1)
            prefix = f'<b>§{num}.</b> '
        elif subtype == 'commline' and n_attr:
            # Set the section ID to the full range, e.g., "ln81-82"
            output_section_id = f"ln{n_attr}"
            
            # For the href, take only the first number before any hyphen.
            link_target_num = n_attr.split('-')[0]
            
            # Build the link with the correct href, but display the full range.
            prefix = f'<a href="#iliad1-line-{link_target_num}"><b>{n_attr}:</b></a> '

        self.footnotes = []
        self.footnote_counter = 0
        content = self.process_children(element)
        
        final_content = prefix + content

        # --- NEW BLOCK ---
        if self.footnotes:
            final_content += '<div class="footnote-section">'
            for note in self.footnotes:
                # Conditionally add a back-link only if one was created (for inline notes)
                back_link = ''
                if 'backlink_id' in note and note['backlink_id']:
                    back_link = f' <a href="#{note["backlink_id"]}" title="Go back to text">&#8617;</a>'
                
                # Use the 'id' and 'number' from the dictionary populated by handle_note
                final_content += f'<p id="{note["id"]}"><sup>{note["number"]}</sup> {note["text"]}{back_link}</p>'
            final_content += '</div>'
        # --- NEW BLOCK ---
        #if self.footnotes:
        #    final_content += '<div class="footnote-section">'
        #    for note in self.footnotes:
            # Optional: Add a back-link (e.g., an arrow ↩) from the note back to the text.
        #        back_link = f'<a href="#{note["backlink_id"]}" title="Go back to text">&#8617;</a>'
        #        final_content += f'<p id="{note["id"]}"><sup>{note["number"]}</sup> {note["text"]} {back_link}</p>'
        #    final_content += '</div>'

        

        #if self.footnotes:
        #    final_content += '<div class="footnote-section">'
        #    for i, note_text in enumerate(self.footnotes, 1):
         #       final_content += f'<p id="fn-{output_section_id}-{i}"><sup>{i}</sup> {note_text}</p>'
         #   final_content += '</div>'
            
        return f'<section id="{output_section_id}">{final_content}</section>'


    def handle_div_bak(self, element):
        """Handles div elements, including special formatting and numbering."""
        original_div_id = element.get(f'{{{NS["xml"]}}}id', '')
        output_section_id = original_div_id
        n_attr = element.get('n')
        subtype = element.get('subtype')
        prefix = ''

        id_match = re.match(r'^p(\d+)$', original_div_id)
        if id_match:
            num = id_match.group(1)
            prefix = f'<b>§{num}.</b> '
        elif subtype == 'commline' and n_attr:
            output_section_id = f"ln{n_attr}"
            prefix = f'<a href="#iliad1-line-{n_attr}"><b>{n_attr}:</b></a> '

        self.footnotes = []
        self.footnote_counter = 0
        content = self.process_children(element)
        
        final_content = prefix + content

        if self.footnotes:
            final_content += '<div class="footnote-section">'
            for i, note_text in enumerate(self.footnotes, 1):
                final_content += f'<p id="fn-{output_section_id}-{i}"><sup>{i}</sup> {note_text}</p>'
            final_content += '</div>'
            
        return f'<section id="{output_section_id}">{final_content}</section>'

    def handle_foreign(self, element):
        lang = element.get(f'{{{NS["xml"]}}}lang')
        cls = "greek" if lang == 'grc' else ""
        return f'<span class="{cls}"><i>{self.process_children(element)}</i></span>'

    def handle_title(self, element): return f'<i>{self.process_children(element)}</i>'
    def handle_lb(self, element): return "<br/>"
    def handle_list(self, element): return f"<ul>{self.process_children(element)}</ul>"
    def handle_item(self, element): return f"<li>{self.process_children(element)}</li>"
    
    # Table Handling
    def handle_table(self, element): return f"<table>{self.process_children(element)}</table>"
    def handle_tbody(self, element): return f"<tbody>{self.process_children(element)}</tbody>"
    def handle_thead(self, element): return f"<thead>{self.process_children(element)}</thead>"
    def handle_row(self, element): return f"<tr>{self.process_children(element)}</tr>"
    def handle_tr(self, element): return f"<tr>{self.process_children(element)}</tr>"
    def handle_cell(self, element):
        tag = 'th' if element.getparent().get('role') == 'label' else 'td'
        colspan = f" colspan='{element.get('cols')}'" if element.get('cols') else ''
        content = self.process_children(element)
        return f"<{tag}{colspan}>{content}</{tag}>"
    def handle_td(self, element):
        colspan = f" colspan='{element.get('colspan')}'" if element.get('colspan') else ""
        content = self.process_children(element)
        return f"<td{colspan}>{content}</td>"

    def handle_lem(self, element):
        return f'<span class="lem">{self.process_children(element)}</span>'
    
    def handle_figure(self, element):
        return f"<figure>{self.process_children(element)}</figure>"
    def handle_figDesc(self, element):
        return f"<figcaption>{self.process_children(element)}</figcaption>"

    def handle_note(self, element):
        """
        Processes a <note> element.
        - If place="foot", it queues the note content for display at the end of the section
          and returns an empty string (the in-text link is handled by a separate <ref> tag).
        - If place="inline" or no place is specified, it generates an in-text reference
          and queues the content.
        """
        place = element.get('place', 'inline')
        note_text = self.process_children(element)

        if place == 'foot':
            # This is the body of a footnote, referenced by a <ref> tag elsewhere.
            note_id = element.get(f'{{{NS["xml"]}}}id')
            note_num = element.get('n')
            
            # This kind of note MUST have an xml:id to be linked to.
            if not note_id:
                return ""

            self.footnotes.append({
                'id': note_id,
                'number': note_num,
                'text': note_text
                # No 'backlink_id' is created here because the ref is separate.
            })
            # Return an empty string; the <ref> tag creates the in-text link.
            return ''
        else: # place="inline" or default behavior
            # This note creates its own in-text reference.
            note_number = len(self.footnotes) + 1
            
            div_ancestors = element.xpath('ancestor::tei:div[1]', namespaces=NS)
            div_ancestor = div_ancestors[0] if div_ancestors else None
            div_id = div_ancestor.get(f'{{{NS["xml"]}}}id', 'unknown') if div_ancestor is not None else 'unknown'

            ref_id = f"fnref-{div_id}-{note_number}"
            note_id = f"fn-{div_id}-{note_number}"

            self.footnotes.append({
                'id': note_id,
                'number': note_number,
                'text': note_text,
                'backlink_id': ref_id
            })
            # Return the clickable superscript reference.
            return f'<a id="{ref_id}" href="#{note_id}" class="footnote-ref"><sup>{note_number}</sup></a>'

    def handle_note_bak(self, element):
        """
        Processes a <note> element, generating an in-text reference
        and queuing the footnote content for rendering at the end of its section.
        """
        # 1. Determine the note number.
        note_number = len(self.footnotes) + 1

        # 2. Get the xml:id of the parent <div> to ensure footnote links are unique.
        #    CORRECTION: Use .xpath() for full XPath support, as .find() does not
        #    support the 'ancestor::' axis. .xpath() returns a list.
        div_ancestors = element.xpath('ancestor::tei:div[1]', namespaces=NS)
        
        # Take the first element from the list if it exists.
        div_ancestor = div_ancestors[0] if div_ancestors else None

        # Use a fallback 'unknown' if the note is not in a div with an id.
        div_id = div_ancestor.get(f'{{{NS["xml"]}}}id', 'unknown') if div_ancestor is not None else 'unknown'
        
        # 3. Create unique and consistent IDs for the reference and the footnote.
        ref_id = f"fnref-{div_id}-{note_number}"
        note_id = f"fn-{div_id}-{note_number}"

        # 4. Process the content of the footnote.
        note_text = self.process_children(element)

        # 5. Append a structured dictionary to the list.
        self.footnotes.append({
            'id': note_id,
            'number': note_number,
            'text': note_text,
            'backlink_id': ref_id
        })

        # 6. Return the clickable superscript reference.
        return f'<a id="{ref_id}" href="#{note_id}" class="footnote-ref"><sup>{note_number}</sup></a>'

    def handle_note_bak1(self, element):
        """
        Processes a <note> element, generating an in-text reference
        and queuing the footnote content for rendering at the end of its section.
        """
        # 1. Determine the note number based on how many notes have already been processed
        #    in the current section. This removes the need for self.footnote_counter.
        note_number = len(self.footnotes) + 1

        # 2. Get the xml:id of the parent <div> to ensure footnote links are unique
        #    across the entire document.
        div_ancestor = element.find('ancestor::tei:div[1]', namespaces=NS)
        # Use a fallback 'unknown' if the note is not in a div with an id.
        div_id = div_ancestor.get(f'{{{NS["xml"]}}}id', 'unknown') if div_ancestor is not None else 'unknown'

        # 3. Create unique and consistent IDs for the in-text reference and the footnote itself.
        #    - fnref-*: The <a> tag in the text, allowing a backlink from the footnote.
        #    - fn-*: The <p> tag in the footnote section, which is the target of the link.
        ref_id = f"fnref-{div_id}-{note_number}"
        note_id = f"fn-{div_id}-{note_number}"

        # 4. Process the content of the footnote.
        note_text = self.process_children(element)

        # 5. Append a structured dictionary to the list. This makes the data easier to
        #    use when rendering the footnote section in the handle_div method.
        self.footnotes.append({
            'id': note_id,
            'number': note_number,
            'text': note_text,
            'backlink_id': ref_id
        })

        # 6. Return the clickable superscript reference to be placed in the main text.
        return f'<a id="{ref_id}" href="#{note_id}" class="footnote-ref"><sup>{note_number}</sup></a>'

    def handle_note_bak(self, element):
        self.footnote_counter += 1
        note_text = self.process_children(element)
        self.footnotes.append(note_text)
        
        div_id_elements = element.xpath('ancestor::tei:div[1]/@xml:id', namespaces=NS)
        div_id = div_id_elements[0] if div_id_elements else 'section'
        
        note_num = element.get('n') or self.footnote_counter

        preceding_pbs = element.xpath('preceding::tei:pb[last()]', namespaces=NS)
        page_num = None
        if preceding_pbs:
            page_num = preceding_pbs[0].get('n')

        if page_num:
            num_display = f"pg. {page_num} n. {note_num}"
        else:
            num_display = note_num
            
        # --- MODIFIED LINE ---
        # Added an id="fnref-..." to the link to serve as a return anchor.
        return f'<a id="fnref-{div_id}-{self.footnote_counter}" href="#fn-{div_id}-{self.footnote_counter}" class="footnote-ref">{num_display}</a>'

    

            
    def handle_lg(self, element): return f"<blockquote class='lg'>{self.process_children(element)}</blockquote>"
    

            
    def handle_l_bak(self, element):
        """Handles lines of poetry, adding clickable line numbers and an anchor."""
        content = self.process_children(element)
        line_num_html = ''
        anchor_id = ''
        
        line_num = element.get('n')
        if line_num:
            span_xml_id = ''  # Initialize an empty string for the xml:id attribute

            is_in_iliad1 = element.xpath('ancestor::tei:div[@xml:id="iliad1"]', namespaces=NS)
            if is_in_iliad1:
                anchor_id = f' id="iliad1-line-{line_num}"'
                # Construct the xml:id for the span if in Iliad 1
                span_xml_id = f' xml:id="il1-{line_num}"'
            
            # Add the span_xml_id variable to the span tag
            line_num_html = f'<a class="line-link" href="#ln{line_num}"><span class="line-number"{span_xml_id}>{line_num}</span></a>'

        return f'<div class="line"{anchor_id}>{line_num_html}{content}</div>'

    def handle_l(self, element):
        """Handles lines of poetry, adding clickable line numbers and an anchor."""
        content = self.process_children(element)
        line_num = element.get('n')
        
        # Get the xml:id directly from the <l> element itself
        line_xml_id = element.get(f"{{{NS['xml']}}}id")
        
        span_id_attr = ''
        div_id_attr = ''

        # If the <l> tag has an xml:id, use it for the <span>
        if line_xml_id:
            span_id_attr = f' xml:id="{line_xml_id}"'
            
            # Also, if it's an Iliad 1 line, create the anchor for the parent <div>.
            # This is for other parts of the text (like the commentary) to link to this line.
            if line_xml_id.startswith('il1-'):
                div_id_attr = f' id="iliad1-line-{line_num}"'

        # Build the line number HTML. The span will now include the xml:id if it was found.
        line_num_html = ''
        if line_num:
            line_num_html = f'<a class="line-link" href="#ln{line_num}"><span class="line-number"{span_id_attr}>{line_num}</span></a>'
        
        # Assemble the final HTML for the line, including the div's ID if needed.
        return f'<div class="line"{div_id_attr}>{line_num_html}{content}</div>'    

    def handle_l_bak3(self, element):
        """Handles lines of poetry, adding clickable line numbers and an anchor."""
        content = self.process_children(element)
        line_num_html = ''
        anchor_id = ''

        line_num = element.get('n')

        # --- TEMPORARY DEBUG BLOCK ---
        # This will print the ancestors of line 173 to help us find the correct ID.
        if line_num == '173' and not self._has_debugged_l_ancestors:
            print("\n" + "="*25)
            print("DEBUGGING ANCESTOR INFO FOR LINE 173")
            print("="*25)
            ancestors = element.xpath('ancestor::*')
            for i, ancestor in enumerate(reversed(ancestors)): # Reversed to show top-down
                tag_name = etree.QName(ancestor.tag).localname
                # Create a clean dictionary of attributes, handling namespaces
                attrs = {}
                for key, val in ancestor.attrib.items():
                    attr_name = etree.QName(key).localname
                    if 'id' in attr_name: # Emphasize IDs
                        attr_name = f"***{attr_name.upper()}***"
                    attrs[attr_name] = val

                print(f"Level {i+1}: <{tag_name}>  Attributes: {attrs}")
            print("="*25 + "\n")
            self._has_debugged_l_ancestors = True
        # --- END DEBUG BLOCK ---

        if line_num:
            span_xml_id = ''
            
            # We are using the flexible XPath from before, which we now expect to fail.
            # The debug output above will tell us what to replace this with.
            is_in_iliad1 = element.xpath(
                "ancestor::tei:div[contains(translate(@xml:id, 'ILIAD', 'iliad'), 'iliad') and contains(@xml:id, '1')]", 
                namespaces=NS
            )

            if is_in_iliad1:
                anchor_id = f' id="iliad1-line-{line_num}"'
                span_xml_id = f' xml:id="il1-{line_num}"'
            
            line_num_html = f'<a class="line-link" href="#ln{line_num}"><span class="line-number"{span_xml_id}>{line_num}</span></a>'

        return f'<div class="line"{anchor_id}>{line_num_html}{content}</div>'    

    def handle_l_bak2(self, element):
        """Handles lines of poetry, adding clickable line numbers and an anchor."""
        content = self.process_children(element)
        line_num_html = ''
        anchor_id = ''
        
        line_num = element.get('n')
        if line_num:
            span_xml_id = ''  # Initialize an empty string for the xml:id attribute

            is_in_iliad1 = element.xpath('ancestor::tei:div[@xml:id="iliad1"]', namespaces=NS)
            if is_in_iliad1:
                anchor_id = f' id="iliad1-line-{line_num}"'
                # Construct the xml:id for the span if in Iliad 1
                span_xml_id = f' xml:id="il1-{line_num}"'
            
            # Add the span_xml_id variable to the span tag
            line_num_html = f'<a class="line-link" href="#ln{line_num}"><span class="line-number"{span_xml_id}>{line_num}</span></a>'

        return f'<div class="line"{anchor_id}>{line_num_html}{content}</div>'
    def handle_quote(self, element):
        """Handles quotes. If it's a poetry quote, let the lg handler do the styling."""
        if element.find('.//tei:lg', namespaces=NS) is not None:
            return self.process_children(element)
        return f"<blockquote>{self.process_children(element)}</blockquote>"

    def handle_hi(self, element):
        rend = element.get('rend', '')
        if 'bold' in rend: return f"<b>{self.process_children(element)}</b>"
        if 'italic' in rend: return f"<i>{self.process_children(element)}</i>"
        if 'superscript' in rend: return f"<sup>{self.process_children(element)}</sup>"
        return self.process_children(element)

    def handle_ref(self, element):
        target = element.get("target", "")
        text = (self.process_children(element) or "").strip()
        target_attr = ' target="_blank"' if target.startswith('http') else ''

        # --- CORRECTED LOGIC START ---
        # Correctly check for the ranged target pattern like "#p597-#p598"
        # The regex now correctly looks for the second "#p".
        range_match = re.match(r'^#p(\d+)-#p(\d+)$', target)
        
        if range_match:
            start_num = range_match.group(1)
            end_num = range_match.group(2)

            # Create two separate links using the numbers from the target
            start_link = f'<a href="#p{start_num}">{start_num}</a>'
            end_link = f'<a href="#p{end_num}">{end_num}</a>'
            
            return f'{start_link}-{end_link}'
        # --- CORRECTED LOGIC END ---

        # Original logic for other cases (including http links and text-based refs) follows
        if not target and text:
            sub_para_pattern = r'^\d+,\s*\d+$'
            if re.match(sub_para_pattern, text):
                first_num = text.split(',')[0].strip()
                link_target = f"#p{first_num}"
                return f'<a href="{link_target}">{text}</a>'
            elif ',' in text:
                parts = text.split(',')
                links = [self.handle_ref_part(part.strip()) for part in parts]
                return ', '.join(links)
            else:
                return self.handle_ref_part(text)
        if target:
            if(re.match('^[0-9]$',text)):
                return f'<a href="{target}"{target_attr}><sup>{text}</sup></a>'
            else:
                return f'<a href="{target}"{target_attr}>{text}</a>'
        return text
    
    def handle_ref_bak(self, element):
        target = element.get("target", "")
        text = (self.process_children(element) or "").strip()
        target_attr = ' target="_blank"' if target.startswith('http') else ''
        if not target and text:
            sub_para_pattern = r'^\d+,\s*\d+$'
            if re.match(sub_para_pattern, text):
                first_num = text.split(',')[0].strip()
                link_target = f"#p{first_num}"
                return f'<a href="{link_target}">{text}</a>'
            elif ',' in text:
                parts = text.split(',')
                links = [self.handle_ref_part(part.strip()) for part in parts]
                return ', '.join(links)
            else:
                return self.handle_ref_part(text)
        if target:
            return f'<a href="{target}"{target_attr}>{text}</a>'
        return text

    


    def handle_ref_part(self, text_part):
        """Helper to process a single number or a hyphenated range."""
        # --- NEW LOGIC START ---
        # Check if the part contains a hyphen, indicating a range
        if '-' in text_part:
            parts = text_part.split('-')
            # Ensure we have exactly two parts and they are both numbers
            if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
                start_num, end_num = parts
                start_link = f'<a href="#p{start_num}">{start_num}</a>'
                end_link = f'<a href="#p{end_num}">{end_num}</a>'
                return f'{start_link}-{end_link}'
        # --- NEW LOGIC END ---
        
        # Original logic for single numbers
        elif text_part.isdigit():
            target = f"#p{text_part}"
            return f'<a href="{target}">{text_part}</a>'
            
        # Fallback for non-numeric/non-range text
        return text_part

    

    def handle_cit(self, element): return self.handle_quote(element)
    def handle_term(self, element): return f"<b>{self.process_children(element)}</b>"
    def handle_gloss(self, element): return f"<i>{self.process_children(element)}</i>"
    def handle_app(self, element): return self.process_children(element)

    def build_sidebar_header(self):
        """Extracts metadata from the TEI header to build an HTML header for the sidebar."""
        header_parts = []
        
        title_el = self.tree.find('.//tei:titleStmt/tei:title', NS)
        if title_el is not None and title_el.text:
            header_parts.append(f"<h1>{html.escape(title_el.text)}</h1>")
            
        author_el = self.tree.find('.//tei:titleStmt/tei:author', NS)
        if author_el is not None and author_el.text:
            header_parts.append(f"<p><strong>Author:</strong> {html.escape(author_el.text)}</p>")
        
        pub_stmt = self.tree.find('.//tei:publicationStmt', NS)
        if pub_stmt is not None:
            publisher = pub_stmt.find('tei:publisher', NS)
            pub_place = pub_stmt.find('tei:pubPlace', NS)
            date = pub_stmt.find('tei:date', NS)
            pub_info = []
            if publisher is not None and publisher.text: pub_info.append(publisher.text)
            if pub_place is not None and pub_place.text: pub_info.append(pub_place.text)
            if date is not None and date.text: pub_info.append(date.text)
            if pub_info:
                header_parts.append(f"<p><strong>Published:</strong> {html.escape(', '.join(pub_info))}</p>")

        ref_el = self.tree.find('.//tei:sourceDesc/tei:bibl/tei:ref', NS)
        if ref_el is not None:
            target = ref_el.get('target', '')
            text = ref_el.text or "View Source Scan"
            if target:
                header_parts.append(f'<p><a href="{target}" target="_blank">{html.escape(text)}</a></p>')

        if not header_parts:
            return ""
            
        return f'<header class="sidebar-header">{"".join(header_parts)}</header>'


# THIS IS THE NEW, CORRECTED CODE
    def build_toc_bak(self):
            """Builds the HTML for the Table of Contents sidebar."""
            toc_div = self.tree.find('.//tei:div[@xml:id="toc"]', NS)
            if toc_div is None: return "<h2>Table of Contents Not Found</h2>"
    
            html_toc = "<h2>Contents</h2>"
            in_lessons_section = False
            html_toc += '<ul class="toc-level-1">'

            for item in toc_div.findall('.//tei:item', NS):
            # --- Text processing (same as before) ---
                ref = item.find('tei:ref', NS)
                full_text = ' '.join("".join(item.itertext()).split())
                ref_text = (ref.text or "").strip() if ref is not None else ""
        
                if ref_text and full_text.endswith(ref_text):
                    link_text = full_text[:-len(ref_text)].strip()
                else:
                    link_text = full_text
        
                # Skip the generic "LESSON" header item
                if link_text.upper() == 'LESSON':
                    continue

                # Check if we are entering the lessons section based on the link text
                is_lesson = re.match(r'^\d+\.', link_text.strip())
        
                if is_lesson and not in_lessons_section:
                    html_toc += '</ul><h3>Lessons</h3><ul class="toc-level-2">'
                    in_lessons_section = True

                # --- Linking logic (same as before) ---
                target = ""
                if ref is not None:
                # Priority 1: Use explicit @target attribute if it exists on the <ref>
                    target = ref.get('target', '')

                    # Priority 2: If no explicit target, check if it's a lesson and link to its xml:id
                    if not target and is_lesson:
                        lesson_num_str = is_lesson.group(0).replace('.', '')
                        lesson_id = f"lesson{lesson_num_str}"
                
                        lesson_div = self.tree.find(f'.//tei:div[@xml:id="{lesson_id}"]', NS)
                        if lesson_div is not None:
                            target = f"#{lesson_id}"
            
                    # Priority 3 (Fallback): If no target was found, link to the page break <pb/>
                    if not target:
                        ref_text_content = ref_text.replace('p. ', '').strip()
                        if ref_text_content:
                            pb_el = self.tree.find(f'.//tei:pb[@n="{ref_text_content}"]', NS)
                            if pb_el is not None:
                                pb_id = pb_el.get(f'{{{NS["xml"]}}}id')
                                anchor_id = pb_id if pb_id else f"page-{ref_text_content}"
                                target = f"#{anchor_id}"

        # --- HTML Generation (MODIFIED) ---
        # The line that created a separate 'display_text' has been removed.
        # We now use 'link_text' directly to ensure the lesson number is displayed.
                if target:
                    base_anchor = target.lstrip('#') 
                    html_toc += f'<li><a href="#center-{base_anchor}">{link_text}</a></li>'
                elif link_text: 
                     html_toc += f'<li>{link_text}</li>'

            html_toc += '</ul>'
            return html_toc
      
    def build_toc(self):
        """Builds the HTML for the Table of Contents sidebar."""
        toc_div = self.tree.find('.//tei:div[@xml:id="toc"]', NS)
        if toc_div is None: return "<h2>Table of Contents Not Found</h2>"
        
        html_toc = "<h2>Contents</h2>"
        in_lessons_section = False
        html_toc += '<ul class="toc-level-1">'

        for item in toc_div.findall('.//tei:item', NS):
            # --- Text processing (same as before) ---
            ref = item.find('tei:ref', NS)
            full_text = ' '.join("".join(item.itertext()).split())
            ref_text = (ref.text or "").strip() if ref is not None else ""
        
            if ref_text and full_text.endswith(ref_text):
                link_text = full_text[:-len(ref_text)].strip()
            else:
                link_text = full_text
            
            # Skip the generic "LESSON" header item
            if link_text.upper() == 'LESSON':
                continue

            # Check if we are entering the lessons section
            is_lesson = re.match(r'^\d+\.', link_text.strip())

            if is_lesson and not in_lessons_section:
                html_toc += '</ul><h3>Lessons</h3><ul class="toc-level-2">'
                in_lessons_section = True
            
            # --- CORRECTED Linking logic ---
            target = ""
            
            # PRIORITY 1: If it's a lesson, create a link to the lesson's xml:id
            if is_lesson:
                lesson_num_str = is_lesson.group(0).replace('.', '')
                lesson_id = f"lesson{lesson_num_str}"
                
                # Check if a div with this lesson_id actually exists in the document
                lesson_div = self.tree.find(f'.//tei:div[@xml:id="{lesson_id}"]', NS)
                if lesson_div is not None:
                    target = f"#{lesson_id}"
            
            # PRIORITY 2 (FALLBACK): If a lesson link wasn't created, or it's not a lesson, use the <ref> tag.
            if not target and ref is not None:
                # First, try the explicit @target attribute
                ref_target = ref.get('target', '')
                if ref_target:
                    target = ref_target
                else:
                    # If no @target, fall back to the text content to find a page break
                    ref_text_content = ref_text.replace('p. ', '').strip()
                    if ref_text_content:
                        pb_el = self.tree.find(f'.//tei:pb[@n="{ref_text_content}"]', NS)
                        if pb_el is not None:
                            pb_id = pb_el.get(f'{{{NS["xml"]}}}id')
                            # The anchor ID is the pb's xml:id if it exists, otherwise construct one
                            anchor_id = pb_id if pb_id else f"page-{ref_text_content}"
                            target = f"#{anchor_id}"

            # --- HTML Generation ---
            if target:
                base_anchor = target.lstrip('#') 
                html_toc += f'<li><a href="#center-{base_anchor}">{link_text}</a></li>'
            elif link_text: 
                html_toc += f'<li>{link_text}</li>'

        html_toc += '</ul>'
        return html_toc
    

    def convert(self):
        """Main conversion method to generate the full HTML document."""
        title_element = self.tree.find('.//tei:titleStmt/tei:title', NS)
        title = title_element.text if title_element is not None else "Homeric Greek"
        
        html_doc = f"<!DOCTYPE html>\n<html lang='en'>\n<head>\n"
        html_doc += f"    <meta charset='UTF-8'>\n"
        html_doc += f"    <meta name='viewport' content='width=device-width, initial-scale=1.0'>\n"
        html_doc += f"    <title>{html.escape(title)}</title>\n{CSS_STYLE}\n</head>\n<body>\n"
        
        sidebar_header = self.build_sidebar_header()
        table_of_contents = self.build_toc()
        html_doc += f"<nav id='sidebar'>{sidebar_header}{table_of_contents}</nav>\n"
        
        text_element = self.tree.find('.//tei:text', NS)
        main_content_html = self.process_children(text_element) if text_element is not None else ""

        # --- CORRECTED LOGIC ---
        # For the CENTER pane: give all IDs a "center-" prefix, but make all hrefs point to the "right-" pane.
        center_content = re.sub(r'id="([^"]+)"', r'id="center-\1"', main_content_html)
        center_content = re.sub(r'href="#([^"]+)"', r'href="#right-\1"', center_content)
        
        # For the RIGHT pane: give all IDs a "right-" prefix, but make all hrefs point to the "center-" pane.
        right_content = re.sub(r'id="([^"]+)"', r'id="right-\1"', main_content_html)
        right_content = re.sub(r'href="#([^"]+)"', r'href="#center-\1"', right_content)

        html_doc += f"<main id='main-content-center' class='content-pane'>{center_content}</main>\n"
        html_doc += f"<aside id='main-content-right' class='content-pane'>{right_content}</aside>\n"
        
        html_doc += f"{JS_SCRIPT}\n</body>\n</html>"
        return html_doc

    def convert_bak(self):
        """Main conversion method to generate the full HTML document."""
        title_element = self.tree.find('.//tei:titleStmt/tei:title', NS)
        title = title_element.text if title_element is not None else "Homeric Greek"
        
        html_doc = f"<!DOCTYPE html>\n<html lang='en'>\n<head>\n"
        html_doc += f"    <meta charset='UTF-8'>\n"
        html_doc += f"    <meta name='viewport' content='width=device-width, initial-scale=1.0'>\n"
        html_doc += f"    <title>{html.escape(title)}</title>\n{CSS_STYLE}\n</head>\n<body>\n"
        
        sidebar_header = self.build_sidebar_header()
        table_of_contents = self.build_toc()
        html_doc += f"<nav id='sidebar'>{sidebar_header}{table_of_contents}</nav>\n"
        
        text_element = self.tree.find('.//tei:text', NS)
        main_content_html = self.process_children(text_element) if text_element is not None else ""

        center_content = re.sub(r'id="([^"]+)"', r'id="center-\1"', main_content_html)
        center_content = re.sub(r'href="#([^"]+)"', r'href="#center-\1"', center_content)
        
        right_content = re.sub(r'id="([^"]+)"', r'id="right-\1"', main_content_html)
        right_content = re.sub(r'href="#([^"]+)"', r'href="#right-\1"', right_content)

        html_doc += f"<main id='main-content-center' class='content-pane'>{center_content}</main>\n"
        html_doc += f"<aside id='main-content-right' class='content-pane'>{right_content}</aside>\n"
        
        html_doc += f"{JS_SCRIPT}\n</body>\n</html>"
        return html_doc

# --- Main execution block for the Jupyter cell ---
if __name__ == "__main__":
    try:
        if not os.path.exists(XML_FILE):
             raise FileNotFoundError(f"The file '{XML_FILE}' was not found.")
             
        with open(XML_FILE, 'rb') as f:
            parser = etree.XMLParser(remove_blank_text=False, remove_comments=True)
            xml_tree = etree.parse(f, parser)

        converter = TeiToHtmlConverter(xml_tree)
        html_output = converter.convert()

        # Pretty-print the HTML for readability
        #root = etree.HTML(html_output)
        #pretty_html = etree.tostring(root, pretty_print=True, method="html", encoding="unicode")

        #with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        #    f.write(pretty_html)

# The lxml pretty-print round-trip causes unwanted URL encoding in href attributes.
      # By writing the raw html_output directly, we preserve the Unicode characters.
        #with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        #  f.write(html_output)
        #print(f"✅ Successfully converted '{XML_FILE}' and saved it as '{OUTPUT_FILE}'.")

        from bs4 import BeautifulSoup
      
      # Use BeautifulSoup to pretty-print the HTML without encoding href attributes.
        soup = BeautifulSoup(html_output, 'html.parser')
        pretty_html = soup.prettify()

        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
           f.write(pretty_html)
           print(f"✅ Successfully converted '{XML_FILE}' and saved it as '{OUTPUT_FILE}'.")
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(f"❌ An error occurred: {e}")

✅ Successfully converted 'pharr2025.xml' and saved it as 'homeric_greek_three_pane.html'.


In [61]:
import re

f = open('pharr2025.xml')
outf = open('/Users/gcrane/Downloads/pharr2025-l.xml','w')
lnum = 0
for l in f:
    l = re.sub('\s+$','',l)
    if(re.search('<l>.*[a-z][a-z][a-z]',l)):
        print(l,file=outf)
        print('skp',l)
        continue
    if(re.search('<l>',l) and not re.search('[a-z][a-z]',l)):
        lnum = lnum + 1
        #print('1',l)
        l = re.sub('<l,'<l n="'+str(lnum)+'">', l)
        print(l,file=outf)
        continue
    m = re.search('<l n="([0-9]+)">',l)
    if(m):
        nextlnum = int(m[1])
        if(not nextlnum == lnum + 1):
            print('jump ',lnum,nextlnum,l)
        lnum = nextlnum
        #print('2',l)

    print(l,file=outf)

f.close()
outf.close()

SyntaxError: unterminated string literal (detected at line 15) (2076008137.py, line 15)