Skip to content

Work in Progress

This page is currently under construction.

cluster_studio_dashboard¶

At a glance

Useful for:

API Documentation: cluster_studio_dashboard()

What is needed to generate the chart?

Worked Example¶

from splink.duckdb.linker import DuckDBLinker
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl
from splink.duckdb.blocking_rule_library import block_on
from splink.datasets import splink_datasets
import logging, sys
logging.disable(sys.maxsize)

# Load the fake_1000 example dataset from splink_datasets.
df = splink_datasets.fake_1000

# Blocking rules used when generating pairwise predictions.
prediction_blocking_rules = [
    block_on("first_name"),
    block_on("surname"),
]

# One comparison per input column that the model scores.
comparisons = [
    ctl.name_comparison("first_name"),
    ctl.name_comparison("surname"),
    ctl.date_comparison("dob", cast_strings_to_date=True),
    cl.exact_match("city", term_frequency_adjustments=True),
    ctl.email_comparison("email", include_username_fuzzy_level=False),
]

settings = {
    "link_type": "dedupe_only",
    "blocking_rules_to_generate_predictions": prediction_blocking_rules,
    "comparisons": comparisons,
    "retain_intermediate_calculation_columns": True,
    "retain_matching_columns": True,
}

linker = DuckDBLinker(df, settings)
linker.estimate_u_using_random_sampling(max_pairs=1e6)

# Run one expectation-maximisation training pass per blocking rule:
# first blocking on first_name + surname, then on dob.
for training_columns in (["first_name", "surname"], "dob"):
    blocking_rule_for_training = block_on(training_columns)
    linker.estimate_parameters_using_expectation_maximisation(
        blocking_rule_for_training
    )

# Score candidate pairs, then cluster the pairwise predictions at a
# stricter threshold than the one used for prediction.
df_predictions = linker.predict(threshold_match_probability=0.2)
df_clusters = linker.cluster_pairwise_predictions_at_threshold(
    df_predictions,
    threshold_match_probability=0.5,
)

# Write the dashboard to img/cluster_studio.html, replacing any existing file.
linker.cluster_studio_dashboard(
    df_predictions,
    df_clusters,
    "img/cluster_studio.html",
    sampling_method="by_cluster_size",
    overwrite=True,
)

# You can view the cluster_studio.html file in your browser, or inline in a notebook as follows
from IPython.display import IFrame

# Keep this call as the final expression so the notebook renders it.
IFrame(src="./img/cluster_studio.html", width="100%", height=1200)

What the chart shows¶

How to interpret the chart¶

Actions to take as a result of the chart¶