Refactor convert command to remove polars entirely

2024-04-15 22:23:00 +10:00 · 2024-04-15 22:23:00 +10:00 · 27a8d2584d
commit 27a8d2584d
parent 3f61d7fac6
3 changed files with 12 additions and 53 deletions
--- a/ipmap.py
+++ b/ipmap.py
@@ -16,7 +16,6 @@ from png import Writer
 from cmap import Colormap
 from hilbert import decode
 import numpy as np
-import polars as pl
 ip_bytes = 4
 ip_bits = ip_bytes * 8
@@ -56,17 +55,17 @@ def make_coords(output_path: Path, batches = default_batches, processes = defaul
            print(f"finished writing to file")
 def convert(input_path: Path, output_path: Path):
-    print(f"scanning csv '{input_path}' into array...", end = " ", flush = True)
+    print(f"reading csv '{input_path}' into array...", end = " ", flush = True)
-    lf = pl.scan_csv(input_path, schema = {
+    arr = np.loadtxt(input_path, dtype = np.uint32, delimiter = ",", skiprows = 1)
-        "saddr_raw": pl.UInt32,
+    print("done")
-        "rtt_us": pl.UInt64,
+    print("filtering out unsuccessful values...", end = " ", flush = True)
-        "success": pl.UInt8
+    arr = arr[arr[:, -1] == 1]
-    })
+    print("done")
-    lf = lf.filter(pl.col("success") == 1)
+    print("removing success column...", end = " ", flush = True)
-    lf = lf.drop("success")
+    arr = arr[:, :-1]
-    lf = lf.with_columns(rtt_us = pl.col("rtt_us").clip(0, 0xFFFFFFFF).cast(pl.UInt32))
+    print("done")
-    lf = lf.unique("saddr_raw")
+    print("removing duplicate IP addresses...", end = " ", flush = True)
-    arr = lf.collect().to_numpy()
+    arr = arr[np.unique(arr[:, 0], return_index = True)[1]]
    print("done")
    print("converting IP addresses from big-endian to little-endian...", end = " ", flush = True)
    arr[:, 0].byteswap(inplace = True)
--- a/poetry.lock
+++ b/poetry.lock
@@ -264,45 +264,6 @@ files = [
    {file = "packaging-24.0.tar.gz", hash = "sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9"},
 ]
-[[package]]
-name = "polars-lts-cpu"
-version = "0.20.17"
-description = "Blazingly fast DataFrame library"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "polars_lts_cpu-0.20.17-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c5ba1113df88bd0e46bc2e649279f1e2f09f20d24a7e3a8b07d342d1e117bf40"},
-    {file = "polars_lts_cpu-0.20.17-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:514e833c63d2734d9028ca754fe441479cb8d68d06efe9f88fdb348db9578941"},
-    {file = "polars_lts_cpu-0.20.17-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3512862da0bcb764ed5e63bb122d265295d503e5294c839d5f46f88937543cc1"},
-    {file = "polars_lts_cpu-0.20.17-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:2a30789e25a07e0c925e6fde030d2ee53024ae621a0194c423ff83f359d5f62c"},
-    {file = "polars_lts_cpu-0.20.17-cp38-abi3-win_amd64.whl", hash = "sha256:b5a3487d481517525d7c9b9c69210f123c2d1f233c47487fa058646c2dc3d42c"},
-    {file = "polars_lts_cpu-0.20.17.tar.gz", hash = "sha256:e11eb08f9264459339af4942c4be9c187daf2ffe4040d24284582e4e0e492ab7"},
-]
-[package.extras]
-adbc = ["adbc-driver-manager", "adbc-driver-sqlite"]
-all = ["polars[adbc,async,cloudpickle,connectorx,deltalake,fastexcel,fsspec,gevent,numpy,pandas,plot,pyarrow,pydantic,pyiceberg,sqlalchemy,timezone,xlsx2csv,xlsxwriter]"]
-async = ["nest-asyncio"]
-cloudpickle = ["cloudpickle"]
-connectorx = ["connectorx (>=0.3.2)"]
-deltalake = ["deltalake (>=0.14.0)"]
-fastexcel = ["fastexcel (>=0.9)"]
-fsspec = ["fsspec"]
-gevent = ["gevent"]
-matplotlib = ["matplotlib"]
-numpy = ["numpy (>=1.16.0)"]
-openpyxl = ["openpyxl (>=3.0.0)"]
-pandas = ["pandas", "pyarrow (>=7.0.0)"]
-plot = ["hvplot (>=0.9.1)"]
-pyarrow = ["pyarrow (>=7.0.0)"]
-pydantic = ["pydantic"]
-pyiceberg = ["pyiceberg (>=0.5.0)"]
-pyxlsb = ["pyxlsb (>=1.0)"]
-sqlalchemy = ["pandas", "sqlalchemy"]
-timezone = ["backports-zoneinfo", "tzdata"]
-xlsx2csv = ["xlsx2csv (>=0.8.0)"]
-xlsxwriter = ["xlsxwriter"]
 [[package]]
 name = "pydantic"
 version = "2.6.4"
@@ -580,4 +541,4 @@ files = [
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "882810214ec005c8e1d0b99099d0f9fc8d6e8fb9140ac9f452e18e7e3c580176"
+content-hash = "3ca6841a3434879d43d536188bf827e8a74f959cbac3da3d272dc1cc47769620"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,6 @@ python = "^3.11"
 pypng = "^0.20220715.0"
 numpy = "^1.26.4"
 numpy-hilbert-curve = "^1.0.1"
-polars-lts-cpu = "^0.20.17"
 cmap = "^0.1.3"
 fastapi = "^0.110.1"
 uvicorn = "^0.29.0"