Create portable version¶

This page describes, and actually implements, how to make the Jupyter notebooks of the whole project portable.
Recursive HTML exporting and link replacement¶
This one is in use right now to create linked and working html pages.
To achieve this, you can write a Python script that:
- Recursively searches for .ipynb files in a directory.
- Exports each .ipynb file to an HTML file.
- Replaces links within the HTML files that point to .ipynb files with .html links (so that the links work correctly in a browser).
You can use the following libraries:
- os or pathlib for file traversal.
- nbconvert for converting .ipynb to .html.
- re for regular expressions to replace links in HTML content.
Prerequisites¶
Install the Required Modules¶
You can install the necessary modules using the following commands in your terminal or command prompt.
- Install
nbconvert
: This module is used to convert Jupyter notebooks to HTML.
pip install nbconvert > /dev/null
- Install
nbformat
: This module is used to read and write Jupyter notebook files.
pip install nbformat > /dev/null
Here's an outline of the script:
Script: Convert and Modify Links in HTML¶
import os
import nbformat
from nbconvert import HTMLExporter
import re
def convert_ipynb_to_html(ipynb_file):
    """Export a single Jupyter notebook to an HTML file next to it.

    Returns the path of the generated .html file.
    """
    with open(ipynb_file, 'r', encoding='utf-8') as handle:
        nb = nbformat.read(handle, as_version=4)

    exporter = HTMLExporter()
    body, _resources = exporter.from_notebook_node(nb)

    # The HTML file shares the notebook's base name and directory.
    html_filename = os.path.splitext(ipynb_file)[0] + '.html'
    if os.path.isfile(html_filename):
        os.remove(html_filename)
    with open(html_filename, 'w', encoding='utf-8') as handle:
        handle.write(body)
    return html_filename
def replace_ipynb_links_in_html(html_file, root_dir):
    """Rewrite href targets ending in .ipynb to their .html counterparts."""
    with open(html_file, 'r', encoding='utf-8') as handle:
        content = handle.read()

    def _to_html_link(match):
        return make_relative_html_link(match.group(1), match.group(2), root_dir)

    updated = re.sub(
        r'(?<=href=["\'])(.*?\.ipynb)(#.*)?(?=["\'])',
        _to_html_link,
        content,
    )

    # The combined book additionally has its <details> folding stripped.
    if "Everybody_Can_Code" in html_file:
        updated = remove_details_tags(updated)

    with open(html_file, 'w', encoding='utf-8') as handle:
        handle.write(updated)
def remove_details_tags(html_content):
    """Remove all lines containing <details> or </details> tags."""
    kept = []
    for line in html_content.splitlines():
        if '<details>' in line or '</details>' in line:
            continue
        kept.append(line)
    return '\n'.join(kept)
def make_relative_html_link(ipynb_link, anchor, root_dir):
    """Map an .ipynb href to an .html path expressed relative to root_dir.

    The optional fragment (e.g. "#section") is re-appended unchanged.
    """
    base, _ext = os.path.splitext(ipynb_link)
    html_link = base + '.html'
    # Resolve against the CWD, then re-express the path relative to root_dir.
    relative_path = os.path.relpath(os.path.abspath(html_link), start=root_dir)
    return f"{relative_path}{anchor}" if anchor else relative_path
def recursive_convert_and_replace_links(root_dir):
    """Convert every .ipynb below root_dir and fix up the links it contains."""
    for current_dir, _dirs, filenames in os.walk(root_dir):
        for filename in filenames:
            if not filename.endswith('.ipynb'):
                continue
            # Export next to the source notebook, then rewrite its links.
            html_file = convert_ipynb_to_html(os.path.join(current_dir, filename))
            replace_ipynb_links_in_html(html_file, root_dir)
def convert_to_linked_html():
    """Run the recursive conversion with the current working directory as root."""
    recursive_convert_and_replace_links(os.getcwd())

if __name__ == "__main__":
    convert_to_linked_html()
How the Script Works:¶
Convert
.ipynb
to.html
:- The
convert_ipynb_to_html
function usesnbconvert
to convert a Jupyter Notebook (.ipynb
) to an HTML file and saves it in theoutput_dir
.
- The
Replace
.ipynb
Links with.html
Links:- The
replace_ipynb_links_in_html
function reads the generated HTML file, finds any links to.ipynb
files, and replaces them with.html
links. - This is done using a regular expression that matches the links in the HTML content.
- The
Recursively Process Files:
- The
recursive_convert_and_replace_links
function walks through the directory tree (os.walk
) and processes all.ipynb
files. It calls the conversion and replacement functions for each file found.
- The
Notes:¶
- File Paths: Make sure to update the
root_directory
andoutput_directory
variables to match your desired input and output paths. - Dependencies: You may need to install
nbconvert
andnbformat
viapip install nbconvert nbformat
.
This script will help you convert all .ipynb
files to HTML and ensure that the links between notebooks are correctly pointing to the corresponding .html
files.
import os
import nbformat
from nbformat.v4 import new_markdown_cell
import re
def load_notebook(path):
    """Read a Jupyter notebook file and return it as a v4 notebook node."""
    with open(path, 'r', encoding='utf-8') as handle:
        return nbformat.read(handle, as_version=4)
def extract_links(notebook):
    """Extract links to other .ipynb files from Markdown cells."""
    pattern = re.compile(r'\[.*?\]\((.*?)\)')
    found = []
    for cell in notebook.cells:
        if cell.cell_type != 'markdown':
            continue
        for target in pattern.findall(cell.source):
            # Drop any fragment/query part before checking the extension.
            cleaned = target.split('#')[0].split('?')[0].strip()
            if cleaned.endswith('.ipynb'):
                found.append(cleaned)
    return found
def find_file(base_path, relative_path):
    """
    Try to find a file in the given base path or its subdirectories.
    Args:
        base_path: The root directory to search from.
        relative_path: The relative path extracted from a link.
    Returns:
        The resolved absolute path if the file exists, or None.
    """
    candidates = [
        os.path.join(base_path, relative_path),
        os.path.join(os.getcwd(), relative_path),
    ]
    # Fall back to matching by bare filename anywhere under base_path.
    wanted_name = os.path.basename(relative_path)
    for directory, _subdirs, filenames in os.walk(base_path):
        candidates.extend(
            os.path.join(directory, name)
            for name in filenames
            if os.path.basename(name) == wanted_name
        )
    return next((c for c in candidates if os.path.exists(c)), None)
def adjust_links(source, notebook_dir, combined_dir):
    """Adjust links in Markdown cells and <img> tags to point to the correct locations."""
    def _markdown_target(match):
        label, target = match.group(1), match.group(2)
        stripped = target.split('#')[0].split('?')[0]
        if stripped.endswith('.ipynb'):
            # Notebook links become in-document anchors named after the file.
            section = os.path.splitext(os.path.basename(stripped))[0]
            return f"{label}(#{section.replace(' ', '-')})"
        rebased = os.path.relpath(os.path.join(notebook_dir, stripped), combined_dir)
        return f"{label}({rebased})"

    def _img_source(match):
        stripped = match.group(1).split('#')[0].split('?')[0]
        rebased = os.path.relpath(os.path.join(notebook_dir, stripped), combined_dir)
        return f'<img src="{rebased}"'

    result = re.compile(r'(\[.*?\])\((.*?)\)').sub(_markdown_target, source)
    result = re.compile(r'<img\s+src=["\'](.*?)["\']').sub(_img_source, result)
    return result
def combine_notebooks(base_path, notebook_path, visited=None, combined=None, combined_dir=None):
    """
    Combine notebooks into one, following links recursively.
    Args:
        base_path: The base directory containing all notebooks.
        notebook_path: Path to the current notebook relative to base_path.
        visited: Set of already visited notebooks to avoid duplication.
        combined: Combined notebook object.
        combined_dir: Directory where the combined notebook will be saved.
    """
    # Top-level call: create the mutable state threaded through the recursion.
    if visited is None:
        visited = set()
    if combined is None:
        combined = nbformat.v4.new_notebook()
    if combined_dir is None:
        combined_dir = base_path
    resolved_path = find_file(base_path, notebook_path)
    if resolved_path is None:
        # Dangling link: warn and return what has been combined so far.
        print(f"Warning: File {notebook_path} from {base_path} not found. Skipping.")
        return combined
    # Notebooks are identified by their path relative to base_path.
    notebook_key = os.path.relpath(resolved_path, base_path)
    if notebook_key in visited:
        return combined  # Avoid processing the same notebook twice
    visited.add(notebook_key)
    # Load the notebook
    notebook = load_notebook(resolved_path)
    notebook_dir = os.path.dirname(resolved_path)
    # Add a heading to separate notebooks in the combined file
    section_name = os.path.splitext(os.path.basename(notebook_path))[0]
    # Inline CSS that forces a page break before each section when printing.
    page_break = """<style>
@media print {
/* Erzwingt einen Seitenumbruch vor diesem Abschnitt */
.page-break-before {
page-break-before: always; /* Alternative: break-before: page; */
}
}
</style>
<div class="page-break-before">
"""
    combined.cells.append(new_markdown_cell(f"\n{page_break}\n\n##### {section_name}\n\n---\n</div>\n"))
    # Add cells from the current notebook
    for cell in notebook.cells:
        cell_copy = nbformat.from_dict(cell)
        if cell.cell_type == 'markdown':
            # Adjust links in Markdown cells and <img> tags
            cell_copy.source = adjust_links(cell_copy.source, notebook_dir, combined_dir)
        combined.cells.append(cell_copy)
    # Find and process links to other notebooks
    links = extract_links(notebook)
    for link in links:
        # Depth-first: fully expand each linked notebook before the next link.
        combine_notebooks(base_path, link, visited, combined, combined_dir)
    return combined
def depth_first_combining():
    """Combine the book depth-first, starting from the table of contents."""
    base_path = os.getcwd()  # Current working directory where the script is executed
    initial_notebook = "abstract/Contents.de.ipynb"  # Replace with the initial notebook's path relative to base_path
    combined = combine_notebooks(base_path, initial_notebook)
    # Save the combined notebook
    output_path = os.path.join(base_path, "Everybody_Can_Code.ipynb")
    with open(output_path, 'w', encoding='utf-8') as handle:
        nbformat.write(combined, handle)
    print(f"Combined notebook saved to {output_path}")

if __name__ == "__main__":
    depth_first_combining()
Combined notebook saved to /Users/starkj/Documents/2hands/Soproming/Repo/JederKannCoden/notebooks/Everybody_Can_Code.ipynb
/Users/starkj/Library/Python/3.9/lib/python/site-packages/nbformat/__init__.py:132: MissingIDFieldWarning: Cell is missing an id field, this will become a hard error in future nbformat versions. You may want to use `normalize()` on your notebooks before validations (available since nbformat 5.1.4). Previous versions of nbformat are fixing this issue transparently, and will stop doing so in the future. validate(nb) /Users/starkj/Library/Python/3.9/lib/python/site-packages/nbformat/__init__.py:132: DuplicateCellId: Non-unique cell id '3adfadcf' detected. Corrected to '57475387'. validate(nb) /Users/starkj/Library/Python/3.9/lib/python/site-packages/nbformat/__init__.py:132: DuplicateCellId: Non-unique cell id '3adfadcf' detected. Corrected to 'f94e6526'. validate(nb) /Users/starkj/Library/Python/3.9/lib/python/site-packages/nbformat/__init__.py:132: DuplicateCellId: Non-unique cell id 'd7d9b20a' detected. Corrected to 'e59303f8'. validate(nb)
Same functionality but going breadth-first¶
import os
import nbformat
from nbformat.v4 import new_markdown_cell
import re
def load_notebook(path):
    """Open *path* and parse it as a version-4 Jupyter notebook."""
    with open(path, 'r', encoding='utf-8') as nb_file:
        return nbformat.read(nb_file, as_version=4)
def extract_links(notebook):
    """Extract links to other .ipynb files from Markdown cells."""
    md_link = re.compile(r'\[.*?\]\((.*?)\)')
    results = []
    markdown_cells = (c for c in notebook.cells if c.cell_type == 'markdown')
    for cell in markdown_cells:
        for raw in md_link.findall(cell.source):
            # Strip fragments and query strings before the suffix check.
            candidate = raw.split('#')[0].split('?')[0].strip()
            if candidate.endswith('.ipynb'):
                results.append(candidate)
    return results
def find_file(base_path, relative_path):
    """
    Try to find a file in the given base path or its subdirectories.
    """
    # Match by bare filename only; the directory part of the link is ignored.
    target = os.path.basename(relative_path)
    for directory, _subdirs, filenames in os.walk(base_path):
        if target in filenames:
            return os.path.join(directory, target)
    return None
def adjust_links(source, notebook_dir, combined_dir):
    """Adjust links in Markdown cells and <img> tags to point to the correct locations."""
    def _fix_md(match):
        label, target = match.groups()
        bare = target.split('#')[0].split('?')[0]
        if bare.endswith('.ipynb'):
            # Notebook links turn into in-document anchors named after the file.
            anchor = os.path.splitext(os.path.basename(bare))[0].replace(' ', '-')
            return f"{label}(#{anchor})"
        rebased = os.path.relpath(os.path.join(notebook_dir, bare), combined_dir)
        return f"{label}({rebased})"

    def _fix_img(match):
        rebased = os.path.relpath(os.path.join(notebook_dir, match.group(1)), combined_dir)
        return f'<img src="{rebased}"'

    out = re.sub(r'(\[.*?\])\((.*?)\)', _fix_md, source)
    return re.sub(r'<img\s+src=["\'](.*?)["\']', _fix_img, out)
def get_link_data(base_path, notebook_path):
    """Resolve a linked notebook; return (relative_key, absolute_path), or ("", "") if missing."""
    resolved = find_file(base_path, notebook_path)
    if resolved is None:
        print(f"Warning: File {notebook_path} not found. Skipping.")
        return "", ""
    return os.path.relpath(resolved, base_path), resolved
def collect_links(base_path, notebook_path, visited=None, added=None):
    """
    Collect all notebooks reachable from *notebook_path*, breadth-first.

    Returns a list of (notebook_key, resolved_path) tuples in the order the
    notebooks should appear in the combined document.

    Args:
        base_path: Root directory that notebook keys are relative to.
        notebook_path: Link to the notebook to start from.
        visited: Keys whose outgoing links were already followed.
        added: Keys already present in the result list.
    """
    if visited is None:
        visited = set()
    if added is None:
        added = set()
    notebook_key, resolved_path = get_link_data(base_path, notebook_path)
    if notebook_key == "":
        return []
    if notebook_key in visited:
        return []
    visited.add(notebook_key)
    notebook = load_notebook(resolved_path)
    links = extract_links(notebook)
    all_links = []
    if notebook_key not in added:
        all_links.append((notebook_key, resolved_path))
        added.add(notebook_key)
    # Breadth-first: queue every direct child before recursing into any of them.
    for link in links:
        child_key, child_path = get_link_data(base_path, link)
        # BUG FIX: skip unresolved links. Previously an ("", "") pair was
        # appended to the result, and combine_notebooks later crashed with
        # FileNotFoundError when trying to open the empty path.
        if child_key and child_key not in added:
            all_links.append((child_key, child_path))
            added.add(child_key)
    for link in links:
        all_links.extend(collect_links(base_path, link, visited, added))
    return all_links
def combine_notebooks(base_path, links, combined_dir=None):
    """
    Combine all notebooks into one in the order of provided links.
    """
    combined = nbformat.v4.new_notebook()
    if combined_dir is None:
        combined_dir = base_path
    for notebook_key, resolved_path in links:
        notebook = load_notebook(resolved_path)
        notebook_dir = os.path.dirname(resolved_path)
        # Section heading is derived from the notebook's file name.
        section_name = os.path.splitext(os.path.basename(notebook_key))[0]
        # Inline CSS that forces a page break before each section when printing.
        page_break = """<style>
@media print {
/* Erzwingt einen Seitenumbruch vor diesem Abschnitt */
.page-break-before {
page-break-before: always; /* Alternative: break-before: page; */
}
}
</style>
<div class="page-break-before">
"""
        combined.cells.append(new_markdown_cell(f"\n{page_break}\n\n##### {section_name}\n\n---\n</div>\n"))
        # combined.cells.append(new_markdown_cell(f"### {section_name}\n\n---\n"))
        for cell in notebook.cells:
            cell_copy = nbformat.from_dict(cell)
            if cell.cell_type == 'markdown':
                # Rewrite relative links/images so they still resolve from combined_dir.
                cell_copy.source = adjust_links(cell_copy.source, notebook_dir, combined_dir)
            combined.cells.append(cell_copy)
    return combined
def breadth_first_combining():
    """Collect all reachable notebooks breadth-first, then merge and save them."""
    base_path = os.getcwd()  # Current working directory
    initial_notebook = "abstract/Intro.de.ipynb"  # Replace with the initial notebook's path relative to base_path
    ordered_links = collect_links(base_path, initial_notebook)
    merged = combine_notebooks(base_path, ordered_links)
    output_path = os.path.join(base_path, "Everybody_Can_Code.ipynb")
    with open(output_path, 'w', encoding='utf-8') as out_file:
        nbformat.write(merged, out_file)
    print(f"Combined notebook saved to {output_path}")

if __name__ == "__main__":
    breadth_first_combining()
Warning: File ../../tools/Git.de.ipynb not found. Skipping. Warning: File ../../tools/Git.de.ipynb not found. Skipping. Warning: File ../programming/csharp/Introduction_csharp.de.ipynb not found. Skipping. Warning: File ../programming/csharp/Introduction_csharp.de.ipynb not found. Skipping.
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[9], line 159 156 print(f"Combined notebook saved to {output_path}") 158 if __name__ == "__main__": --> 159 breadth_first_combining() Cell In[9], line 150, in breadth_first_combining() 147 initial_notebook = "abstract/Intro.de.ipynb" # Replace with the initial notebook's path relative to base_path 149 links = collect_links(base_path, initial_notebook) --> 150 combined_notebook = combine_notebooks(base_path, links) 152 output_path = os.path.join(base_path, "Everybody_Can_Code.ipynb") 153 with open(output_path, 'w', encoding='utf-8') as f: Cell In[9], line 117, in combine_notebooks(base_path, links, combined_dir) 114 combined_dir = base_path 116 for notebook_key, resolved_path in links: --> 117 notebook = load_notebook(resolved_path) 118 notebook_dir = os.path.dirname(resolved_path) 120 section_name = os.path.splitext(os.path.basename(notebook_key))[0] Cell In[9], line 8, in load_notebook(path) 6 def load_notebook(path): 7 """Load a Jupyter Notebook file.""" ----> 8 with open(path, 'r', encoding='utf-8') as f: 9 return nbformat.read(f, as_version=4) File /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/IPython/core/interactiveshell.py:310, in _modified_open(file, *args, **kwargs) 303 if file in {0, 1, 2}: 304 raise ValueError( 305 f"IPython won't let you open fd={file} by default " 306 "as it is likely to crash IPython. If you know what you are doing, " 307 "you can use builtins' open." 308 ) --> 310 return io_open(file, *args, **kwargs) FileNotFoundError: [Errno 2] No such file or directory: ''
Create pdf with chrome headless¶
WiP
!/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --headless --print-to-pdf-no-header --no-margins --window-size=1280,1024 --print-to-pdf="output.pdf" programming/Programmieren1.de.html
Create pdf with wkhtmltopdf¶
WiP
!brew install wkhtmltopdf
!wkhtmltopdf --enable-local-file-access --no-stop-slow-scripts --print-media-type programming/Programmieren1.de.html output.pdf
Create with pdfkit¶
WiP
pip install pdfkit bs4 requests pypdf2
import os
from bs4 import BeautifulSoup
import pdfkit
from PyPDF2 import PdfMerger
def get_all_local_links(html_file, base_folder):
    """
    Extracts all local links to other HTML files within the same folder.
    """
    found = set()
    with open(html_file, 'r', encoding='utf-8') as handle:
        markup = BeautifulSoup(handle, 'html.parser')
    for anchor in markup.find_all('a', href=True):
        target = anchor['href']
        candidate = os.path.join(base_folder, target)
        # Keep only .html targets that actually exist under base_folder.
        if target.endswith('.html') and os.path.exists(candidate):
            found.add(candidate)
    return list(found)
def save_as_pdf(input_html, output_pdf):
    """
    Converts a local HTML file to a PDF.
    """
    # wkhtmltopdf needs local file access for linked assets; keep output quiet.
    pdfkit.from_file(
        input_html,
        output_pdf,
        options={'enable-local-file-access': None, 'quiet': ''},
    )
def create_pdf_with_toc(main_html, base_folder, output_pdf):
    """
    Main function to create a merged PDF with a table of contents.

    Walks the link graph starting at main_html (breadth-first), converts each
    HTML page to a temporary PDF, merges them into output_pdf with bookmarks,
    and removes the temporary files afterwards.
    """
    visited = set()  # Track visited HTML files to avoid duplicates
    links_to_visit = [os.path.join(base_folder, main_html)]
    merger = PdfMerger()  # PdfMerger to combine PDFs
    temp_pdfs = []  # Intermediate per-page PDFs; removed after merging
    page_number = 1
    while links_to_visit:
        html_file = links_to_visit.pop(0)
        if html_file in visited:
            continue
        visited.add(html_file)
        pdf_path = f"page_{page_number}.pdf"
        print(f"Converting: {html_file}")
        save_as_pdf(html_file, pdf_path)  # Convert each HTML to a PDF
        temp_pdfs.append(pdf_path)
        merger.append(pdf_path)  # Append PDF to the merger
        # NOTE(review): page_number - 1 assumes each HTML renders to exactly
        # one PDF page; a multi-page document will shift later bookmarks —
        # confirm against the actual output.
        merger.addBookmark(f"Page {page_number}: {os.path.basename(html_file)}", page_number - 1)  # Add bookmark
        # Find additional links within the current HTML file
        new_links = get_all_local_links(html_file, base_folder)
        for link in new_links:
            if link not in visited:
                links_to_visit.append(link)
        page_number += 1
    merger.write(output_pdf)
    merger.close()
    # BUG FIX: clean up the intermediate page_N.pdf files, which previously
    # accumulated in the working directory on every run.
    for pdf_path in temp_pdfs:
        try:
            os.remove(pdf_path)
        except OSError:
            pass  # best-effort cleanup
    print(f"Merged PDF successfully created as '{output_pdf}'")
# Example usage:
# Assume `base_folder` contains the main HTML file and all linked HTML files
base_folder = "./programming"  # Folder holding the HTML tree to merge
main_html = "Programmieren1.de.html"  # Main HTML file to start with
output_pdf = "merged_document.pdf"  # Name of the merged output PDF
create_pdf_with_toc(main_html, base_folder, output_pdf)
Create PDF with pandoc¶
WiP
!pandoc -s programming/Programmieren1.de.html -o output.pdf --toc
Create PDF with beautiful soup and pdfkit¶
Does not work so well.
1. Extract Headlines from HTML Files:¶
- You'll need to parse the HTML files to extract the headline tags (
<h1>
,<h2>
, etc.) and use them to generate the table of contents. - The
BeautifulSoup
library frombs4
is perfect for parsing HTML.
2. Create the Table of Contents:¶
- Use the extracted headlines to create a TOC in HTML format, with links to the corresponding sections.
3. Insert the TOC at the Beginning of the Combined HTML File:¶
- Add the generated TOC to the beginning of your combined HTML file before converting it to PDF.
4. Convert to PDF:¶
- Once you have the HTML with the TOC, convert it to PDF as before.
Here’s a Python script that implements these steps:
Step 1: Install Necessary Libraries¶
You’ll need to install the required libraries:
pip install beautifulsoup4 pdfkit
Step 2: Python Script to Generate TOC and Convert HTML to PDF¶
import os
from bs4 import BeautifulSoup
import pdfkit
# Function to extract headlines from HTML
def extract_headlines(html_content, file_index):
    """Assign anchor ids to every heading and return (headlines, updated_html)."""
    soup = BeautifulSoup(html_content, 'html.parser')
    headlines = []
    for level in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        for header in soup.find_all(level):
            # Create an anchor link for each headline
            anchor = f"section_{file_index}_{len(headlines)}"
            header['id'] = anchor
            headlines.append((header.text.strip(), level, anchor))
    return headlines, str(soup)
# Function to gather all HTML files recursively
def gather_html_files(directory):
    """Collect every exported notebook page (*.ipynb.html) under *directory*."""
    collected = [
        os.path.join(current, name)
        for current, _dirs, names in os.walk(directory)
        for name in names
        if name.endswith('.ipynb.html')
    ]
    return sorted(collected)  # Sorting ensures the order is preserved
# Specify the top-level folder containing the HTML files
top_level_folder = 'csharp'
output_html = 'combined_with_toc.html'
# Gather all HTML files recursively
html_files = gather_html_files(top_level_folder)
toc_entries = []
full_html_content = "<html><head><title>Document with TOC</title></head><body>"
# Generate the TOC and combine HTML files
for i, filepath in enumerate(html_files):
    with open(filepath, 'r', encoding='utf-8') as infile:
        content = infile.read()
    # Tag every heading with an anchor id and collect it for the TOC.
    headlines, updated_html = extract_headlines(content, i)
    toc_entries.extend(headlines)
    full_html_content += updated_html
    # Force a page break between source documents when printing.
    full_html_content += '<div style="page-break-after: always;"></div>'
# Create the TOC HTML structure
toc_html = '<h1>Table of Contents</h1><ul>'
for text, tag, anchor in toc_entries:
    toc_html += f'<li><a href="#{anchor}">{text}</a></li>'
toc_html += '</ul><div style="page-break-after: always;"></div>'
# Add the TOC to the beginning of the document
# NOTE(review): this prepends the TOC before the opening <html> tag, which is
# not valid HTML; wkhtmltopdf appears to tolerate it, but verify the output.
full_html_content = toc_html + full_html_content + "</body></html>"
# Write the combined HTML with TOC to a file
with open(output_html, 'w', encoding='utf-8') as outfile:
    outfile.write(full_html_content)
# Convert the combined HTML file with TOC to PDF
pdfkit.from_file(output_html, 'output_with_toc.pdf')
print("PDF with Table of Contents has been generated as 'output_with_toc.pdf'.")
Requirements:¶
- wkhtmltopdf: Make sure
wkhtmltopdf
is installed on your system forpdfkit
to work. You can download it from wkhtmltopdf.org. - HTML Structure: Ensure that your HTML files are well-formed, with proper heading tags for the TOC to be generated accurately.
This script will produce a PDF with a generated Table of Contents at the beginning, linking to all the headlines within your HTML files.