Create portable version¶

This page describes, and actually implements, how to make the Jupyter notebooks of the whole project portable.
Recursive HTML exporting and link replacement¶
This one is in use right now to create linked and working html pages.
To achieve this, you can write a Python script that:
- Recursively searches for .ipynb files in a directory.
- Exports each .ipynb file to an HTML file.
- Replaces links within the HTML files that point to .ipynb files with .html links (so that the links work correctly in a browser).
You can use the following libraries:
- os or pathlib for file traversal.
- nbconvert for converting .ipynb to .html.
- re for regular expressions to replace links in HTML content.
Prerequisites¶
Install the Required Modules¶
You can install the necessary modules using the following commands in your terminal or command prompt.
- Install
nbconvert
: This module is used to convert Jupyter notebooks to HTML.
pip install nbconvert > /dev/null
- Install
nbformat
: This module is used to read and write Jupyter notebook files.
pip install nbformat > /dev/null
Here's an outline of the script:
Script: Convert and Modify Links in HTML¶
import os
import nbformat
from nbconvert import HTMLExporter
import re
def convert_ipynb_to_html(ipynb_file):
    """Export a single Jupyter notebook to an HTML file next to it.

    Returns the path of the generated .html file.
    """
    with open(ipynb_file, 'r', encoding='utf-8') as handle:
        nb = nbformat.read(handle, as_version=4)

    exporter = HTMLExporter()
    body, _resources = exporter.from_notebook_node(nb)

    # The HTML file shares the notebook's base name and directory.
    html_filename = os.path.splitext(ipynb_file)[0] + '.html'
    if os.path.isfile(html_filename):
        os.remove(html_filename)
    with open(html_filename, 'w', encoding='utf-8') as handle:
        handle.write(body)
    return html_filename
def replace_ipynb_links_in_html(html_file, root_dir):
    """Rewrite href targets ending in .ipynb to their .html counterparts."""
    with open(html_file, 'r', encoding='utf-8') as handle:
        content = handle.read()

    def _to_html_link(match):
        return make_relative_html_link(match.group(1), match.group(2), root_dir)

    updated = re.sub(
        r'(?<=href=["\'])(.*?\.ipynb)(#.*)?(?=["\'])',
        _to_html_link,
        content,
    )

    # The combined book additionally has its <details> folding stripped.
    if "Everybody_Can_Code" in html_file:
        updated = remove_details_tags(updated)

    with open(html_file, 'w', encoding='utf-8') as handle:
        handle.write(updated)
def remove_details_tags(html_content):
    """Remove all lines containing <details> or </details> tags."""
    kept = []
    for line in html_content.splitlines():
        if '<details>' in line or '</details>' in line:
            continue
        kept.append(line)
    return '\n'.join(kept)
def make_relative_html_link(ipynb_link, anchor, root_dir):
    """Map an .ipynb href to an .html path expressed relative to root_dir.

    The optional fragment (e.g. "#section") is re-appended unchanged.
    """
    base, _ext = os.path.splitext(ipynb_link)
    html_link = base + '.html'
    # Resolve against the CWD, then re-express the path relative to root_dir.
    relative_path = os.path.relpath(os.path.abspath(html_link), start=root_dir)
    return f"{relative_path}{anchor}" if anchor else relative_path
def recursive_convert_and_replace_links(root_dir):
    """Convert every .ipynb below root_dir and fix up the links it contains."""
    for current_dir, _dirs, filenames in os.walk(root_dir):
        for filename in filenames:
            if not filename.endswith('.ipynb'):
                continue
            # Export next to the source notebook, then rewrite its links.
            html_file = convert_ipynb_to_html(os.path.join(current_dir, filename))
            replace_ipynb_links_in_html(html_file, root_dir)
def convert_to_linked_html():
    """Run the recursive conversion with the current working directory as root."""
    recursive_convert_and_replace_links(os.getcwd())

if __name__ == "__main__":
    convert_to_linked_html()
How the Script Works:¶
Convert
.ipynb
to.html
:- The
convert_ipynb_to_html
function usesnbconvert
to convert a Jupyter Notebook (.ipynb
) to an HTML file and saves it in theoutput_dir
.
- The
Replace
.ipynb
Links with.html
Links:- The
replace_ipynb_links_in_html
function reads the generated HTML file, finds any links to.ipynb
files, and replaces them with.html
links. - This is done using a regular expression that matches the links in the HTML content.
- The
Recursively Process Files:
- The
recursive_convert_and_replace_links
function walks through the directory tree (os.walk
) and processes all.ipynb
files. It calls the conversion and replacement functions for each file found.
- The
Notes:¶
- File Paths: Make sure to update the
root_directory
andoutput_directory
variables to match your desired input and output paths. - Dependencies: You may need to install
nbconvert
andnbformat
viapip install nbconvert nbformat
.
This script will help you convert all .ipynb
files to HTML and ensure that the links between notebooks are correctly pointing to the corresponding .html
files.
import os
import nbformat
from nbformat.v4 import new_markdown_cell
import re
def load_notebook(path):
    """Read a Jupyter notebook file and return it as a v4 notebook node."""
    with open(path, 'r', encoding='utf-8') as handle:
        return nbformat.read(handle, as_version=4)
def extract_links(notebook):
    """Extract links to other .ipynb files from Markdown cells."""
    pattern = re.compile(r'\[.*?\]\((.*?)\)')
    found = []
    for cell in notebook.cells:
        if cell.cell_type != 'markdown':
            continue
        for target in pattern.findall(cell.source):
            # Drop any fragment/query part before checking the extension.
            cleaned = target.split('#')[0].split('?')[0].strip()
            if cleaned.endswith('.ipynb'):
                found.append(cleaned)
    return found
def find_file(base_path, relative_path):
    """
    Try to find a file in the given base path or its subdirectories.
    Args:
        base_path: The root directory to search from.
        relative_path: The relative path extracted from a link.
    Returns:
        The resolved absolute path if the file exists, or None.
    """
    candidates = [
        os.path.join(base_path, relative_path),
        os.path.join(os.getcwd(), relative_path),
    ]
    # Fall back to matching by bare filename anywhere under base_path.
    wanted_name = os.path.basename(relative_path)
    for directory, _subdirs, filenames in os.walk(base_path):
        candidates.extend(
            os.path.join(directory, name)
            for name in filenames
            if os.path.basename(name) == wanted_name
        )
    return next((c for c in candidates if os.path.exists(c)), None)
def adjust_links(source, notebook_dir, combined_dir):
    """Adjust links in Markdown cells and <img> tags to point to the correct locations."""
    def _markdown_target(match):
        label, target = match.group(1), match.group(2)
        stripped = target.split('#')[0].split('?')[0]
        if stripped.endswith('.ipynb'):
            # Notebook links become in-document anchors named after the file.
            section = os.path.splitext(os.path.basename(stripped))[0]
            return f"{label}(#{section.replace(' ', '-')})"
        rebased = os.path.relpath(os.path.join(notebook_dir, stripped), combined_dir)
        return f"{label}({rebased})"

    def _img_source(match):
        stripped = match.group(1).split('#')[0].split('?')[0]
        rebased = os.path.relpath(os.path.join(notebook_dir, stripped), combined_dir)
        return f'<img src="{rebased}"'

    result = re.compile(r'(\[.*?\])\((.*?)\)').sub(_markdown_target, source)
    result = re.compile(r'<img\s+src=["\'](.*?)["\']').sub(_img_source, result)
    return result
def combine_notebooks(base_path, notebook_path, visited=None, combined=None, combined_dir=None):
    """
    Combine notebooks into one, following links recursively.
    Args:
        base_path: The base directory containing all notebooks.
        notebook_path: Path to the current notebook relative to base_path.
        visited: Set of already visited notebooks to avoid duplication.
        combined: Combined notebook object.
        combined_dir: Directory where the combined notebook will be saved.
    """
    # Top-level call: create the mutable state threaded through the recursion.
    if visited is None:
        visited = set()
    if combined is None:
        combined = nbformat.v4.new_notebook()
    if combined_dir is None:
        combined_dir = base_path
    resolved_path = find_file(base_path, notebook_path)
    if resolved_path is None:
        # Dangling link: warn and return what has been combined so far.
        print(f"Warning: File {notebook_path} from {base_path} not found. Skipping.")
        return combined
    # Notebooks are identified by their path relative to base_path.
    notebook_key = os.path.relpath(resolved_path, base_path)
    if notebook_key in visited:
        return combined  # Avoid processing the same notebook twice
    visited.add(notebook_key)
    # Load the notebook
    notebook = load_notebook(resolved_path)
    notebook_dir = os.path.dirname(resolved_path)
    # Add a heading to separate notebooks in the combined file
    section_name = os.path.splitext(os.path.basename(notebook_path))[0]
    # Inline CSS that forces a page break before each section when printing.
    page_break = """<style>
@media print {
/* Erzwingt einen Seitenumbruch vor diesem Abschnitt */
.page-break-before {
page-break-before: always; /* Alternative: break-before: page; */
}
}
</style>
<div class="page-break-before">
"""
    combined.cells.append(new_markdown_cell(f"\n{page_break}\n\n##### {section_name}\n\n---\n</div>\n"))
    # Add cells from the current notebook
    for cell in notebook.cells:
        cell_copy = nbformat.from_dict(cell)
        if cell.cell_type == 'markdown':
            # Adjust links in Markdown cells and <img> tags
            cell_copy.source = adjust_links(cell_copy.source, notebook_dir, combined_dir)
        combined.cells.append(cell_copy)
    # Find and process links to other notebooks
    links = extract_links(notebook)
    for link in links:
        # Depth-first: fully expand each linked notebook before the next link.
        combine_notebooks(base_path, link, visited, combined, combined_dir)
    return combined
def depth_first_combining():
    """Combine the book depth-first, starting from the table of contents."""
    base_path = os.getcwd()  # Current working directory where the script is executed
    initial_notebook = "abstract/Contents.de.ipynb"  # Replace with the initial notebook's path relative to base_path
    combined = combine_notebooks(base_path, initial_notebook)
    # Save the combined notebook
    output_path = os.path.join(base_path, "Everybody_Can_Code.ipynb")
    with open(output_path, 'w', encoding='utf-8') as handle:
        nbformat.write(combined, handle)
    print(f"Combined notebook saved to {output_path}")

if __name__ == "__main__":
    depth_first_combining()
Combined notebook saved to /Users/starkj/Documents/2hands/Soproming/Repo/JederKannCoden/notebooks/Everybody_Can_Code.ipynb
/Users/starkj/Library/Python/3.9/lib/python/site-packages/nbformat/__init__.py:132: MissingIDFieldWarning: Cell is missing an id field, this will become a hard error in future nbformat versions. You may want to use `normalize()` on your notebooks before validations (available since nbformat 5.1.4). Previous versions of nbformat are fixing this issue transparently, and will stop doing so in the future. validate(nb) /Users/starkj/Library/Python/3.9/lib/python/site-packages/nbformat/__init__.py:132: DuplicateCellId: Non-unique cell id '3adfadcf' detected. Corrected to '57475387'. validate(nb) /Users/starkj/Library/Python/3.9/lib/python/site-packages/nbformat/__init__.py:132: DuplicateCellId: Non-unique cell id '3adfadcf' detected. Corrected to 'f94e6526'. validate(nb) /Users/starkj/Library/Python/3.9/lib/python/site-packages/nbformat/__init__.py:132: DuplicateCellId: Non-unique cell id 'd7d9b20a' detected. Corrected to 'e59303f8'. validate(nb)
Same functionality but going breadth-first¶
import os
import nbformat
from nbformat.v4 import new_markdown_cell
import re
def load_notebook(path):
    """Open *path* and parse it as a version-4 Jupyter notebook."""
    with open(path, 'r', encoding='utf-8') as nb_file:
        return nbformat.read(nb_file, as_version=4)
def extract_links(notebook):
    """Extract links to other .ipynb files from Markdown cells."""
    md_link = re.compile(r'\[.*?\]\((.*?)\)')
    results = []
    markdown_cells = (c for c in notebook.cells if c.cell_type == 'markdown')
    for cell in markdown_cells:
        for raw in md_link.findall(cell.source):
            # Strip fragments and query strings before the suffix check.
            candidate = raw.split('#')[0].split('?')[0].strip()
            if candidate.endswith('.ipynb'):
                results.append(candidate)
    return results
def find_file(base_path, relative_path):
    """
    Try to find a file in the given base path or its subdirectories.
    """
    # Match by bare filename only; the directory part of the link is ignored.
    target = os.path.basename(relative_path)
    for directory, _subdirs, filenames in os.walk(base_path):
        if target in filenames:
            return os.path.join(directory, target)
    return None
def adjust_links(source, notebook_dir, combined_dir):
    """Adjust links in Markdown cells and <img> tags to point to the correct locations."""
    def _fix_md(match):
        label, target = match.groups()
        bare = target.split('#')[0].split('?')[0]
        if bare.endswith('.ipynb'):
            # Notebook links turn into in-document anchors named after the file.
            anchor = os.path.splitext(os.path.basename(bare))[0].replace(' ', '-')
            return f"{label}(#{anchor})"
        rebased = os.path.relpath(os.path.join(notebook_dir, bare), combined_dir)
        return f"{label}({rebased})"

    def _fix_img(match):
        rebased = os.path.relpath(os.path.join(notebook_dir, match.group(1)), combined_dir)
        return f'<img src="{rebased}"'

    out = re.sub(r'(\[.*?\])\((.*?)\)', _fix_md, source)
    return re.sub(r'<img\s+src=["\'](.*?)["\']', _fix_img, out)
def get_link_data(base_path, notebook_path):
    """Resolve a linked notebook; return (relative_key, absolute_path), or ("", "") if missing."""
    resolved = find_file(base_path, notebook_path)
    if resolved is None:
        print(f"Warning: File {notebook_path} not found. Skipping.")
        return "", ""
    return os.path.relpath(resolved, base_path), resolved
def collect_links(base_path, notebook_path, visited=None, added=None):
    """
    Collect all notebooks reachable from *notebook_path*, breadth-first.

    Returns a list of (notebook_key, resolved_path) tuples in the order the
    notebooks should appear in the combined document.

    Args:
        base_path: Root directory that notebook keys are relative to.
        notebook_path: Link to the notebook to start from.
        visited: Keys whose outgoing links were already followed.
        added: Keys already present in the result list.
    """
    if visited is None:
        visited = set()
    if added is None:
        added = set()
    notebook_key, resolved_path = get_link_data(base_path, notebook_path)
    if notebook_key == "":
        return []
    if notebook_key in visited:
        return []
    visited.add(notebook_key)
    notebook = load_notebook(resolved_path)
    links = extract_links(notebook)
    all_links = []
    if notebook_key not in added:
        all_links.append((notebook_key, resolved_path))
        added.add(notebook_key)
    # Breadth-first: queue every direct child before recursing into any of them.
    for link in links:
        child_key, child_path = get_link_data(base_path, link)
        # BUG FIX: skip unresolved links. Previously an ("", "") pair was
        # appended to the result, and combine_notebooks later crashed with
        # FileNotFoundError when trying to open the empty path.
        if child_key and child_key not in added:
            all_links.append((child_key, child_path))
            added.add(child_key)
    for link in links:
        all_links.extend(collect_links(base_path, link, visited, added))
    return all_links
def combine_notebooks(base_path, links, combined_dir=None):
    """
    Combine all notebooks into one in the order of provided links.
    """
    combined = nbformat.v4.new_notebook()
    if combined_dir is None:
        combined_dir = base_path
    for notebook_key, resolved_path in links:
        notebook = load_notebook(resolved_path)
        notebook_dir = os.path.dirname(resolved_path)
        # Section heading is derived from the notebook's file name.
        section_name = os.path.splitext(os.path.basename(notebook_key))[0]
        # Inline CSS that forces a page break before each section when printing.
        page_break = """<style>
@media print {
/* Erzwingt einen Seitenumbruch vor diesem Abschnitt */
.page-break-before {
page-break-before: always; /* Alternative: break-before: page; */
}
}
</style>
<div class="page-break-before">
"""
        combined.cells.append(new_markdown_cell(f"\n{page_break}\n\n##### {section_name}\n\n---\n</div>\n"))
        # combined.cells.append(new_markdown_cell(f"### {section_name}\n\n---\n"))
        for cell in notebook.cells:
            cell_copy = nbformat.from_dict(cell)
            if cell.cell_type == 'markdown':
                # Rewrite relative links/images so they still resolve from combined_dir.
                cell_copy.source = adjust_links(cell_copy.source, notebook_dir, combined_dir)
            combined.cells.append(cell_copy)
    return combined
def breadth_first_combining():
    """Collect all reachable notebooks breadth-first, then merge and save them."""
    base_path = os.getcwd()  # Current working directory
    initial_notebook = "abstract/Intro.de.ipynb"  # Replace with the initial notebook's path relative to base_path
    ordered_links = collect_links(base_path, initial_notebook)
    merged = combine_notebooks(base_path, ordered_links)
    output_path = os.path.join(base_path, "Everybody_Can_Code.ipynb")
    with open(output_path, 'w', encoding='utf-8') as out_file:
        nbformat.write(merged, out_file)
    print(f"Combined notebook saved to {output_path}")

if __name__ == "__main__":
    breadth_first_combining()
Warning: File ../../tools/Git.de.ipynb not found. Skipping. Warning: File ../../tools/Git.de.ipynb not found. Skipping. Warning: File ../programming/csharp/Introduction_csharp.de.ipynb not found. Skipping. Warning: File ../programming/csharp/Introduction_csharp.de.ipynb not found. Skipping.
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[9], line 159 156 print(f"Combined notebook saved to {output_path}") 158 if __name__ == "__main__": --> 159 breadth_first_combining() Cell In[9], line 150, in breadth_first_combining() 147 initial_notebook = "abstract/Intro.de.ipynb" # Replace with the initial notebook's path relative to base_path 149 links = collect_links(base_path, initial_notebook) --> 150 combined_notebook = combine_notebooks(base_path, links) 152 output_path = os.path.join(base_path, "Everybody_Can_Code.ipynb") 153 with open(output_path, 'w', encoding='utf-8') as f: Cell In[9], line 117, in combine_notebooks(base_path, links, combined_dir) 114 combined_dir = base_path 116 for notebook_key, resolved_path in links: --> 117 notebook = load_notebook(resolved_path) 118 notebook_dir = os.path.dirname(resolved_path) 120 section_name = os.path.splitext(os.path.basename(notebook_key))[0] Cell In[9], line 8, in load_notebook(path) 6 def load_notebook(path): 7 """Load a Jupyter Notebook file.""" ----> 8 with open(path, 'r', encoding='utf-8') as f: 9 return nbformat.read(f, as_version=4) File /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/IPython/core/interactiveshell.py:310, in _modified_open(file, *args, **kwargs) 303 if file in {0, 1, 2}: 304 raise ValueError( 305 f"IPython won't let you open fd={file} by default " 306 "as it is likely to crash IPython. If you know what you are doing, " 307 "you can use builtins' open." 308 ) --> 310 return io_open(file, *args, **kwargs) FileNotFoundError: [Errno 2] No such file or directory: ''
Create pdf with chrome headless¶
WiP
!/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --headless --print-to-pdf-no-header --no-margins --window-size=1280,1024 --print-to-pdf="output.pdf" programming/Programmieren1.de.html
Create pdf with wkhtmltopdf¶
WiP
!brew install wkhtmltopdf
!wkhtmltopdf --enable-local-file-access --no-stop-slow-scripts --print-media-type programming/Programmieren1.de.html output.pdf
Create with pdfkit¶
WiP
pip install pdfkit bs4 requests pypdf2
import os
from bs4 import BeautifulSoup
import pdfkit
from PyPDF2 import PdfMerger
def get_all_local_links(html_file, base_folder):
    """
    Extracts all local links to other HTML files within the same folder.
    """
    found = set()
    with open(html_file, 'r', encoding='utf-8') as handle:
        markup = BeautifulSoup(handle, 'html.parser')
    for anchor in markup.find_all('a', href=True):
        target = anchor['href']
        candidate = os.path.join(base_folder, target)
        # Keep only .html targets that actually exist under base_folder.
        if target.endswith('.html') and os.path.exists(candidate):
            found.add(candidate)
    return list(found)
def save_as_pdf(input_html, output_pdf):
    """
    Converts a local HTML file to a PDF.
    """
    # wkhtmltopdf needs local file access for linked assets; keep output quiet.
    pdfkit.from_file(
        input_html,
        output_pdf,
        options={'enable-local-file-access': None, 'quiet': ''},
    )
def create_pdf_with_toc(main_html, base_folder, output_pdf):
    """
    Main function to create a merged PDF with a table of contents.

    Walks the link graph starting at main_html (breadth-first), converts each
    HTML page to a temporary PDF, merges them into output_pdf with bookmarks,
    and removes the temporary files afterwards.
    """
    visited = set()  # Track visited HTML files to avoid duplicates
    links_to_visit = [os.path.join(base_folder, main_html)]
    merger = PdfMerger()  # PdfMerger to combine PDFs
    temp_pdfs = []  # Intermediate per-page PDFs; removed after merging
    page_number = 1
    while links_to_visit:
        html_file = links_to_visit.pop(0)
        if html_file in visited:
            continue
        visited.add(html_file)
        pdf_path = f"page_{page_number}.pdf"
        print(f"Converting: {html_file}")
        save_as_pdf(html_file, pdf_path)  # Convert each HTML to a PDF
        temp_pdfs.append(pdf_path)
        merger.append(pdf_path)  # Append PDF to the merger
        # NOTE(review): page_number - 1 assumes each HTML renders to exactly
        # one PDF page; a multi-page document will shift later bookmarks —
        # confirm against the actual output.
        merger.addBookmark(f"Page {page_number}: {os.path.basename(html_file)}", page_number - 1)  # Add bookmark
        # Find additional links within the current HTML file
        new_links = get_all_local_links(html_file, base_folder)
        for link in new_links:
            if link not in visited:
                links_to_visit.append(link)
        page_number += 1
    merger.write(output_pdf)
    merger.close()
    # BUG FIX: clean up the intermediate page_N.pdf files, which previously
    # accumulated in the working directory on every run.
    for pdf_path in temp_pdfs:
        try:
            os.remove(pdf_path)
        except OSError:
            pass  # best-effort cleanup
    print(f"Merged PDF successfully created as '{output_pdf}'")
# Example usage:
# Assume `base_folder` contains the main HTML file and all linked HTML files
base_folder = "./programming"  # Folder holding the HTML tree to merge
main_html = "Programmieren1.de.html"  # Main HTML file to start with
output_pdf = "merged_document.pdf"  # Name of the merged output PDF
create_pdf_with_toc(main_html, base_folder, output_pdf)
Create PDF with pandoc¶
WiP
!pandoc -s programming/Programmieren1.de.html -o output.pdf --toc
Create PDF with beautiful soup and pdfkit¶
Does not work so well.
1. Extract Headlines from HTML Files:¶
- You'll need to parse the HTML files to extract the headline tags (
<h1>
,<h2>
, etc.) and use them to generate the table of contents. - The
BeautifulSoup
library frombs4
is perfect for parsing HTML.
2. Create the Table of Contents:¶
- Use the extracted headlines to create a TOC in HTML format, with links to the corresponding sections.
3. Insert the TOC at the Beginning of the Combined HTML File:¶
- Add the generated TOC to the beginning of your combined HTML file before converting it to PDF.
4. Convert to PDF:¶
- Once you have the HTML with the TOC, convert it to PDF as before.
Here’s a Python script that implements these steps:
Step 1: Install Necessary Libraries¶
You’ll need to install the required libraries:
pip install beautifulsoup4 pdfkit
Step 2: Python Script to Generate TOC and Convert HTML to PDF¶
import os
from bs4 import BeautifulSoup
import pdfkit
# Function to extract headlines from HTML
def extract_headlines(html_content, file_index):
    """Assign anchor ids to every heading and return (headlines, updated_html)."""
    soup = BeautifulSoup(html_content, 'html.parser')
    headlines = []
    for level in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        for header in soup.find_all(level):
            # Create an anchor link for each headline
            anchor = f"section_{file_index}_{len(headlines)}"
            header['id'] = anchor
            headlines.append((header.text.strip(), level, anchor))
    return headlines, str(soup)
# Function to gather all HTML files recursively
def gather_html_files(directory):
    """Collect every exported notebook page (*.ipynb.html) under *directory*."""
    collected = [
        os.path.join(current, name)
        for current, _dirs, names in os.walk(directory)
        for name in names
        if name.endswith('.ipynb.html')
    ]
    return sorted(collected)  # Sorting ensures the order is preserved
# Specify the top-level folder containing the HTML files
top_level_folder = 'csharp'
output_html = 'combined_with_toc.html'
# Gather all HTML files recursively
html_files = gather_html_files(top_level_folder)
toc_entries = []
full_html_content = "<html><head><title>Document with TOC</title></head><body>"
# Generate the TOC and combine HTML files
for i, filepath in enumerate(html_files):
    with open(filepath, 'r', encoding='utf-8') as infile:
        content = infile.read()
    # Tag every heading with an anchor id and collect it for the TOC.
    headlines, updated_html = extract_headlines(content, i)
    toc_entries.extend(headlines)
    full_html_content += updated_html
    # Force a page break between source documents when printing.
    full_html_content += '<div style="page-break-after: always;"></div>'
# Create the TOC HTML structure
toc_html = '<h1>Table of Contents</h1><ul>'
for text, tag, anchor in toc_entries:
    toc_html += f'<li><a href="#{anchor}">{text}</a></li>'
toc_html += '</ul><div style="page-break-after: always;"></div>'
# Add the TOC to the beginning of the document
# NOTE(review): this prepends the TOC before the opening <html> tag, which is
# not valid HTML; wkhtmltopdf appears to tolerate it, but verify the output.
full_html_content = toc_html + full_html_content + "</body></html>"
# Write the combined HTML with TOC to a file
with open(output_html, 'w', encoding='utf-8') as outfile:
    outfile.write(full_html_content)
# Convert the combined HTML file with TOC to PDF
pdfkit.from_file(output_html, 'output_with_toc.pdf')
print("PDF with Table of Contents has been generated as 'output_with_toc.pdf'.")
Requirements:¶
- wkhtmltopdf: Make sure
wkhtmltopdf
is installed on your system forpdfkit
to work. You can download it from wkhtmltopdf.org. - HTML Structure: Ensure that your HTML files are well-formed, with proper heading tags for the TOC to be generated accurately.
This script will produce a PDF with a generated Table of Contents at the beginning, linking to all the headlines within your HTML files.