diff --git a/examples/python-magic/README.md b/examples/python-magic/README.md new file mode 100644 index 0000000..249a38c --- /dev/null +++ b/examples/python-magic/README.md @@ -0,0 +1,18 @@ +# python-magic Examples + +Each sub-directory contains a self-contained example. The order in +which the examples are to appear is specified in `order.json` (an +array of directory names in the expected order). + +In each example directory you'll find: + +* `config.toml` - must conform to the specification outlined at + https://docs.pyscript.net/latest/user-guide/configuration/ (it is + parsed and ultimately turned into a JSON representation as part of + the package's API object). +* `setup.py` - Python code for contextual and environmental setup, + NOT SEEN BY THE END USER, but run before the `code.py` code is + evaluated. It lets us create useful (IPython) shims, avoid + repeating boilerplate and whatnot. +* `code.py` - the actual code added to the editor which forms the + practical example of using the package. diff --git a/examples/python-magic/identify_file_types/code.py b/examples/python-magic/identify_file_types/code.py new file mode 100644 index 0000000..92fcfcf --- /dev/null +++ b/examples/python-magic/identify_file_types/code.py @@ -0,0 +1,54 @@ +""" +A first look at python-magic. + +python-magic is a thin Python wrapper around libmagic, the same engine +behind the Unix `file` command. Given some bytes (or a file path), it +guesses the file type by inspecting the content's signature -- not the +filename or extension. + +Docs: https://github.com/ahupp/python-magic +""" +from IPython.core.display import display, HTML + +import magic + + +# A small "file cabinet" of byte signatures for common file types. +# Each entry is a realistic header we'd find at the start of a file. 
+file_samples = { + "report.pdf": b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n1 0 obj\n<< /Type /Catalog >>\nendobj\n", + "logo.png": ( + b"\x89PNG\r\n\x1a\n" + b"\x00\x00\x00\rIHDR\x00\x00\x00\x10\x00\x00\x00\x10\x08\x06\x00\x00\x00" + ), + "photo.jpg": b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00", + "archive.zip": b"PK\x03\x04\x14\x00\x00\x00\x08\x00" + b"\x00" * 20, + "notes.txt": b"Dear diary,\nToday I learned about libmagic.\n", + "song.mp3": b"ID3\x04\x00\x00\x00\x00\x00\x00" + b"\x00" * 64, +} + +heading("Guessing file types from raw bytes") +note( + "We pass the first chunk of each file's bytes to " + "magic.from_buffer and let libmagic identify it. " + "Notice that we never look at the filename -- the bytes alone are enough." +) + +# Build an HTML table of filename, libmagic description, and MIME type. +rows = ["FilenameDescriptionMIME type"] +for name, data in file_samples.items(): + description = magic.from_buffer(data) + mime_type = magic.from_buffer(data, mime=True) + rows.append( + f"{name}" + f"{description}" + f"{mime_type}" + ) + +display(HTML("" + "".join(rows) + "
"), append=True) + +note( + "The recommendation from the docs is to feed at least the first 2048 " + "bytes for reliable identification. Shorter buffers can confuse the " + "detection." +) diff --git a/examples/python-magic/identify_file_types/config.toml b/examples/python-magic/identify_file_types/config.toml new file mode 100644 index 0000000..5471e23 --- /dev/null +++ b/examples/python-magic/identify_file_types/config.toml @@ -0,0 +1 @@ +packages = ["python-magic"] diff --git a/examples/python-magic/identify_file_types/setup.py b/examples/python-magic/identify_file_types/setup.py new file mode 100644 index 0000000..54b59b8 --- /dev/null +++ b/examples/python-magic/identify_file_types/setup.py @@ -0,0 +1,43 @@ +""" +Shim IPython's display API onto PyScript so example code written in a +Jupyter/IPython idiom runs unmodified in the browser. +""" + +import sys +import types +import js +from pyscript import window, HTML, display as _display + +js.alert = window.alert + + +def display(*args, **kwargs): + """Wrap pyscript.display so output lands in the example target.""" + return _display( + *args, **kwargs, target=__pyscript_display_target__, + ) + + +ipython = types.ModuleType("IPython") +core = types.ModuleType("IPython.core") +core_display = types.ModuleType("IPython.core.display") +core_display.display = display +core_display.HTML = HTML +ipython.core = core +core.display = core_display +ipython.get_ipython = lambda: None +ipython.display = core_display +sys.modules["IPython"] = ipython +sys.modules["IPython.core"] = core +sys.modules["IPython.core.display"] = core_display +sys.modules["IPython.display"] = core_display + + +def heading(text, level=2): + """Emit an HTML heading so sections are visually separated.""" + display(HTML(f"{text}"), append=True) + + +def note(text): + """Emit a short paragraph of explanatory prose.""" + display(HTML(f"

{text}

"), append=True) diff --git a/examples/python-magic/mime_router/code.py b/examples/python-magic/mime_router/code.py new file mode 100644 index 0000000..7f72ded --- /dev/null +++ b/examples/python-magic/mime_router/code.py @@ -0,0 +1,87 @@ +# --------------------------------------------------------------------- +# Building a tiny content-aware upload router with python-magic. +# --------------------------------------------------------------------- +# +# A common real-world use of python-magic: an "upload handler" that +# decides what to do with a file based on its true type, regardless +# of what the user named it. This protects against mislabeled or +# disguised files (think: a script renamed to look like an image). + +import magic + + +heading("A MIME-based upload router") +note( + "Each incoming upload is a tuple of (claimed filename, bytes). " + "We detect the real MIME type and dispatch to the appropriate " + "handler. A mismatch between the filename's extension and the " + "detected type is flagged as suspicious." +) + +# Pretend these came in over the wire from a web form. +incoming_uploads = [ + ("vacation.jpg", b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01" + b"\x00\x01\x00\x00" + b"\x00" * 200), + ("budget.pdf", b"%PDF-1.5\n%\xe2\xe3\xcf\xd3\n1 0 obj\n<<>>\nendobj\n" + + b"\x00" * 100), + ("backup.zip", b"PK\x03\x04\x14\x00\x00\x00\x08\x00" + b"\x00" * 200), + # Sneaky: claims to be a PNG but is actually plain text. + ("avatar.png", b"#!/bin/sh\necho 'definitely not an image'\n" * 20), + ("readme.txt", b"Welcome to the project!\n\nThis is a friendly readme.\n" * 10), +] + + +# Map MIME prefixes to handler descriptions. In a real app these would +# be functions; here we just describe what would happen. 
+def route(mime_type): + """Return a (handler_name, action) pair for a detected MIME type.""" + if mime_type.startswith("image/"): + return ("ImageProcessor", "resize and store in /uploads/images") + if mime_type == "application/pdf": + return ("DocumentIndexer", "extract text and add to search index") + if mime_type.startswith("text/"): + return ("TextStore", "save to /uploads/text") + if mime_type in {"application/zip", "application/x-zip-compressed"}: + return ("ArchiveScanner", "scan contents before unpacking") + return ("QuarantineBin", "unknown type, hold for review") + + +# Map common extensions to expected MIME prefixes for sanity-checking. +expected_prefix = { + ".jpg": "image/", ".jpeg": "image/", ".png": "image/", + ".pdf": "application/pdf", + ".zip": "application/zip", + ".txt": "text/", +} + +rows = [ + "FilenameDetected MIME" + "HandlerStatus" +] +for filename, data in incoming_uploads: + mime_type = magic.from_buffer(data, mime=True) + handler, action = route(mime_type) + + # Cross-check the extension against the detected type. + extension = "." + filename.rsplit(".", 1)[-1].lower() + expected = expected_prefix.get(extension, "") + if expected and not mime_type.startswith(expected): + status = "⚠️ extension/content mismatch" + else: + status = "✓ ok" + + rows.append( + f"{filename}" + f"{mime_type}" + f"{handler} — {action}" + f"{status}" + ) + +display(HTML("" + "".join(rows) + "
"), append=True) + +note( + "Notice how avatar.png is correctly identified as a " + "shell script (text/x-shellscript or similar) and " + "flagged. This is exactly the kind of check a libmagic-based router " + "buys you for free." +) diff --git a/examples/python-magic/mime_router/config.toml b/examples/python-magic/mime_router/config.toml new file mode 100644 index 0000000..5471e23 --- /dev/null +++ b/examples/python-magic/mime_router/config.toml @@ -0,0 +1 @@ +packages = ["python-magic"] diff --git a/examples/python-magic/mime_router/setup.py b/examples/python-magic/mime_router/setup.py new file mode 100644 index 0000000..5e3150b --- /dev/null +++ b/examples/python-magic/mime_router/setup.py @@ -0,0 +1,20 @@ +"""Lightweight setup for cell 2 -- no IPython shim, just the names +the first cell already established.""" +import js +from pyscript import window, HTML, display as _display + +js.alert = window.alert + + +def display(*args, **kwargs): + return _display( + *args, **kwargs, target=__pyscript_display_target__, + ) + + +def heading(text, level=2): + display(HTML(f"{text}"), append=True) + + +def note(text): + display(HTML(f"

{text}

"), append=True) diff --git a/examples/python-magic/order.json b/examples/python-magic/order.json new file mode 100644 index 0000000..bab83a3 --- /dev/null +++ b/examples/python-magic/order.json @@ -0,0 +1,4 @@ +[ + "identify_file_types", + "mime_router" +]