testing

archive
Author

Cody

Published

April 1, 1

Testing in soon-to-be-past.

Note

If not rendered properly, view on dkdc.dev.

Some Python code:

import ibis
import ibis.selectors as s
import plotly.io as pio
import plotly.express as px


# configuration
# use the "plotly_dark" template for every Plotly figure created below
pio.templates.default = "plotly_dark"
# NOTE(review): presumably this makes bare expressions such as `t` execute and
# print a preview table (as the rendered output suggests) — confirm in ibis docs
ibis.options.interactive = True

# fetch the penguins example dataset that ships with ibis and display it
t = ibis.examples.penguins.fetch()
t
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓
┃ species ┃ island    ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ body_mass_g ┃ sex    ┃ year  ┃
┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩
│ string  │ string    │ float64        │ float64       │ int64             │ int64       │ string │ int64 │
├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤
│ Adelie  │ Torgersen │           39.1 │          18.7 │               181 │        3750 │ male   │  2007 │
│ Adelie  │ Torgersen │           39.5 │          17.4 │               186 │        3800 │ female │  2007 │
│ Adelie  │ Torgersen │           40.3 │          18.0 │               195 │        3250 │ female │  2007 │
│ Adelie  │ Torgersen │            nan │           nan │              NULL │        NULL │ NULL   │  2007 │
│ Adelie  │ Torgersen │           36.7 │          19.3 │               193 │        3450 │ female │  2007 │
│ Adelie  │ Torgersen │           39.3 │          20.6 │               190 │        3650 │ male   │  2007 │
│ Adelie  │ Torgersen │           38.9 │          17.8 │               181 │        3625 │ female │  2007 │
│ Adelie  │ Torgersen │           39.2 │          19.6 │               195 │        4675 │ male   │  2007 │
│ Adelie  │ Torgersen │           34.1 │          18.1 │               193 │        3475 │ NULL   │  2007 │
│ Adelie  │ Torgersen │           42.0 │          20.2 │               190 │        4250 │ NULL   │  2007 │
│ …       │ …         │              … │             … │                 … │           … │ …      │     … │
└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘
# count the number of rows for each species, naming the result column "count"
t.group_by("species").agg(ibis._.count().name("count"))
┏━━━━━━━━━━━┳━━━━━━━┓
┃ species   ┃ count ┃
┡━━━━━━━━━━━╇━━━━━━━┩
│ string    │ int64 │
├───────────┼───────┤
│ Adelie    │   152 │
│ Gentoo    │   124 │
│ Chinstrap │    68 │
└───────────┴───────┘
# up to five most frequent species values with their counts
# (the dataset only contains three species, so three rows come back)
t["species"].topk(5)
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┓
┃ species   ┃ Count(species) ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━┩
│ string    │ int64          │
├───────────┼────────────────┤
│ Adelie    │            152 │
│ Gentoo    │            124 │
│ Chinstrap │             68 │
└───────────┴────────────────┘
# scatter plot of bill length against bill depth, colored by species
px.scatter(t, title="penguins", x="bill_length_mm", y="bill_depth_mm", color="species")
def transform(t):
    """Add a z-score column for every numeric column and drop incomplete rows.

    Each numeric column ``c`` gains a companion column ``c_zscore`` holding
    ``(c - mean(c)) / std(c)``. Rows containing missing values are removed
    after the new columns have been added.

    Args:
        t: The table expression to transform.

    Returns:
        The table with the extra ``*_zscore`` columns and without rows that
        have missing values.
    """

    def _zscore(col):
        # standardize: center on the column mean, scale by its std deviation
        return (col - col.mean()) / col.std()

    scored = t.mutate(s.across(s.numeric(), {"zscore": _zscore}))
    # drop rows with missing values
    return scored.dropna()


# apply the z-score transformation to the penguins table and display the result
f = transform(t)
f
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓
┃ species ┃ island    ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ body_mass_g ┃ sex    ┃ year  ┃ bill_length_mm_zscore ┃ bill_depth_mm_zscore ┃ flipper_length_mm_zscore ┃ body_mass_g_zscore ┃ year_zscore ┃
┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩
│ string  │ string    │ float64        │ float64       │ int64             │ int64       │ string │ int64 │ float64               │ float64              │ float64                  │ float64            │ float64     │
├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┼───────────────────────┼──────────────────────┼──────────────────────────┼────────────────────┼─────────────┤
│ Adelie  │ Torgersen │           39.1 │          18.7 │               181 │        3750 │ male   │  2007 │             -0.883137 │             0.784218 │                -1.416243 │          -0.563316 │   -1.258032 │
│ Adelie  │ Torgersen │           39.5 │          17.4 │               186 │        3800 │ female │  2007 │             -0.809877 │             0.125990 │                -1.060674 │          -0.500969 │   -1.258032 │
│ Adelie  │ Torgersen │           40.3 │          18.0 │               195 │        3250 │ female │  2007 │             -0.663357 │             0.429788 │                -0.420652 │          -1.186793 │   -1.258032 │
│ Adelie  │ Torgersen │           36.7 │          19.3 │               193 │        3450 │ female │  2007 │             -1.322698 │             1.088015 │                -0.562879 │          -0.937402 │   -1.258032 │
│ Adelie  │ Torgersen │           39.3 │          20.6 │               190 │        3650 │ male   │  2007 │             -0.846507 │             1.746243 │                -0.776220 │          -0.688012 │   -1.258032 │
│ Adelie  │ Torgersen │           38.9 │          17.8 │               181 │        3625 │ female │  2007 │             -0.919767 │             0.328522 │                -1.416243 │          -0.719185 │   -1.258032 │
│ Adelie  │ Torgersen │           39.2 │          19.6 │               195 │        4675 │ male   │  2007 │             -0.864822 │             1.239914 │                -0.420652 │           0.590115 │   -1.258032 │
│ Adelie  │ Torgersen │           41.1 │          17.6 │               182 │        3200 │ female │  2007 │             -0.516837 │             0.227256 │                -1.345129 │          -1.249140 │   -1.258032 │
│ Adelie  │ Torgersen │           38.6 │          21.2 │               191 │        3800 │ male   │  2007 │             -0.974712 │             2.050041 │                -0.705106 │          -0.500969 │   -1.258032 │
│ Adelie  │ Torgersen │           34.6 │          21.1 │               198 │        4400 │ male   │  2007 │             -1.707313 │             1.999408 │                -0.207311 │           0.247203 │   -1.258032 │
│ …       │ …         │              … │             … │                 … │           … │ …      │     … │                     … │                    … │                        … │                  … │           … │
└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┴───────────────────────┴──────────────────────┴──────────────────────────┴────────────────────┴─────────────┘
from sklearn.decomposition import PCA

# select "features" as X: only the columns whose name contains "zscore"
X = f.select(s.contains("zscore"))

# get the first 3 principal components to visualize
n_components = 3
pca = PCA(n_components=n_components).fit(X)

# transform the table to get the principal components
# (the memtable's col0..col2 columns are relabelled to pc1..pc3)
t_pca = ibis.memtable(pca.transform(X)).relabel({"col0": "pc1", "col1": "pc2", "col2": "pc3"})

# join the original table with the PCA table, assuming the order is the same
# NOTE(review): each side gets its own row_number() over an unordered window,
# so the match is only correct if both tables keep the same row order — confirm
f = f.mutate(row_number=ibis.row_number().over()).join(
    t_pca.mutate(row_number=ibis.row_number().over()), "row_number"
)

# plot the first 3 principal components
px.scatter_3d(f, title="penguins PCA", x="pc1", y="pc2", z="pc3", color="species")

graph TD
  %% Top-down flowchart of an extract -> transform -> load pipeline:
  %% three source formats feed cloud storage, two engines transform the data,
  %% and each engine loads into both output targets.

  %% Extraction
  JSON -->|Extract| CLOUDSTORAGE
  PARQUET -->|Extract| CLOUDSTORAGE
  DELTASRC -->|Extract| CLOUDSTORAGE
  
  %% Transformation
  CLOUDSTORAGE -->|Transform| DUCKDB
  CLOUDSTORAGE -->|Transform| POLARS
  
  %% Load
  DUCKDB -->|Load| DELTADST
  DUCKDB -->|Load| DBOUTPUT
  POLARS -->|Load| DELTADST
  POLARS -->|Load| DBOUTPUT
  
  %% Style classes: one fill color per kind of node
  classDef dataFormat fill:#f9d,stroke:#333,stroke-width:2px;
  classDef storage fill:#9df,stroke:#333,stroke-width:2px;
  classDef processing fill:#fd9,stroke:#333,stroke-width:2px;
  classDef output fill:#d9f,stroke:#333,stroke-width:2px;

  %% Assign each node to its style class
  class JSON,PARQUET,DELTASRC dataFormat;
  class CLOUDSTORAGE storage;
  class DUCKDB,POLARS processing;
  class DELTADST,DBOUTPUT output;

Back to top