Add simple HTML minimizer for better compression

author: UltraQbik <no1skill@yandex.ru> 2024-08-22 03:00:37 +0300
committer: UltraQbik <no1skill@yandex.ru> 2024-08-22 03:00:37 +0300
commit: eedb57f893986091887d1a889d6752ea4b06ba2a (patch)
tree: 92aee69db8d6dba0e3b42341c8b40553c7ced851
parent: 76ac2995b1c460f852816a3271fc7777d12b632d (diff)
download: httpy-eedb57f893986091887d1a889d6752ea4b06ba2a.tar.gz
httpy-eedb57f893986091887d1a889d6752ea4b06ba2a.zip
3 files changed, 71 insertions, 1 deletions
diff --git a/main.py b/main.py
index e85e536..f22b957 100644
--- a/main.py
+++ b/main.py
@@ -11,6 +11,7 @@ import signal
 import asyncio
 import aiofiles
 from src.request import Request
+from src.minimizer import minimize_html
 
 
 # path mapping
@@ -168,6 +169,10 @@ class HTTPServer:
             async with aiofiles.open(PATH_MAP[request.path]["path"], "rb") as f:
                 data = await f.read()
 
+            # pre-compress data for HTML files
+            if PATH_MAP[request.path]["path"][-4:] == "html":
+                data = minimize_html(data)
+
             # add gzip compression header (if supported)
             headers = {}
             if "gzip" in compressions:
diff --git a/src/minimizer.py b/src/minimizer.py
new file mode 100644
index 0000000..af1c015
--- /dev/null
+++ b/src/minimizer.py
@@ -0,0 +1,65 @@
+import re
+# import htmlmin
+
+
+def minimize_html(html: bytes) -> bytes:
+    """
+    Minimizes an HTML document (UTF-8 bytes) and returns the result as bytes.
+
+    - runs of ASCII whitespace collapse to ONE space (not zero), so words split
+      by a line break or by several spaces are not glued together
+    - the space between two adjacent tags ('> <') is dropped entirely, which
+      can also remove a visible gap between two inline elements
+    - quotes are removed only from attribute values HTML allows unquoted
+    - ': ' and '; ' are tightened only inside style="..." values
+
+    NOTE: contents of <pre>, <script> and <textarea> are not protected.
+    Raises UnicodeDecodeError if the input is not valid UTF-8.
+    """
+
+    def shrink_tag(match):
+        """Returns the matched tag with compact style values and bare attributes"""
+        tag = re.sub(r'style="[^"]*"',
+                     lambda m: m.group().replace(": ", ":").replace("; ", ";"),
+                     match.group())
+        # a value may lose its quotes only if it is non-empty, has no whitespace,
+        # quotes, '=', '<', '>' or '`', and is followed by a space or '>'
+        # (so '"x"/>' keeps its quotes and '/' never leaks into the value)
+        return re.sub(r'="([^\s"\'=<>`]+)"(?=[ >])', r"=\1", tag)
+
+    text = html.decode("utf8")
+
+    # collapse newlines, tabs and repeated spaces into a single space and trim the
+    # document ends; only ASCII whitespace is touched, so U+00A0 (nbsp) survives
+    text = re.sub(r"[ \t\r\n\f]+", " ", text).strip(" ")
+
+    # simplify '> <' to '><'
+    text = text.replace("> <", "><")
+
+    # remove unnecessary quotes, tag by tag
+    # (a "tag" here is anything between '<' and the next '>')
+    text = re.sub(r"<.*?>", shrink_tag, text)
+
+    # hand back real bytes, as the annotation promises
+    return text.encode("utf8")
+
+
+def test():
+    """Prints the byte size of ../www/about.html before and after minimizing, plus the saving"""
+    with open("../www/about.html", "rb") as page:
+        raw = page.read()
+    small = minimize_html(raw)
+    before, after = len(raw), len(small)
+    saved = (1 - after / before) * 100
+    print(f"Original : {before}\nProcessed: {after}\n"
+          f"Rate     : {saved:.2f}%", end="\n\n")
+
+    # processed = htmlmin.minify(original.decode("utf8"), True, True, True, True, True)
+    #
+    # print(f"Original : {len(original)}\n"
+    #       f"Processed: {len(processed)}\n"
+    #       f"Rate     : {(1 - len(processed) / len(original)) * 100:.2f}%")
+
+
+if __name__ == '__main__':
+    test()
diff --git a/www/index.html b/www/index.html
index e787340..c2ef300 100644
--- a/www/index.html
+++ b/www/index.html
@@ -23,7 +23,7 @@
             <h1> What is it running? </h1>
             <p> > This server is run by the shitty python code I wrote </p>
             <p> > Server does not use flask or any other similar python web frameworks </p>
-            <p> > It primarily uses standard python libraries, with 2 libraries being an exception </p>
+            <p> > It primarily uses standard python libraries, with 1 library being an exception </p>
             <p> > <i> aiofiles </i> - for asynchronous file I/O </p>
         </section>
         <section>
author	UltraQbik <no1skill@yandex.ru>	2024-08-22 03:00:37 +0300
committer	UltraQbik <no1skill@yandex.ru>	2024-08-22 03:00:37 +0300
commit	eedb57f893986091887d1a889d6752ea4b06ba2a (patch)
tree	92aee69db8d6dba0e3b42341c8b40553c7ced851
parent	76ac2995b1c460f852816a3271fc7777d12b632d (diff)
download	httpy-eedb57f893986091887d1a889d6752ea4b06ba2a.tar.gz httpy-eedb57f893986091887d1a889d6752ea4b06ba2a.zip