Deduplicating 50,000 rows of historical person records

Linking a dataset of real historical persons

In this example, we deduplicate a more realistic dataset. The data is based on historical persons scraped from Wikidata, into which duplicate records containing a variety of errors have been introduced.
# Setup: Splink's DuckDB-backed linker, Altair for chart rendering,
# and pandas for loading/inspecting the input data.
from splink.duckdb.duckdb_linker import DuckDBLinker
import altair as alt
# Render Altair charts via the MIME bundle so they display inline in Jupyter.
alt.renderers.enable('mimetype')
import pandas as pd
# Allow long DataFrames to print in full during exploratory analysis.
pd.options.display.max_rows = 1000
# 50k-row dataset of historical persons with deliberately injected
# duplicates and errors (see intro above).
df = pd.read_parquet("./data/historical_figures_with_errors_50k.parquet")
# Exploratory pass: a minimal settings dictionary used only to profile
# columns and to count the comparisons the candidate blocking rules generate.
candidate_blocking_rules = [
    "l.first_name = r.first_name and l.surname = r.surname",
    "l.surname = r.surname and l.dob = r.dob",
    "l.first_name = r.first_name and l.dob = r.dob",
    "l.postcode_fake = r.postcode_fake and l.first_name = r.first_name",
]
settings = {
    "link_type": "dedupe_only",
    "blocking_rules_to_generate_predictions": candidate_blocking_rules,
}
linker = DuckDBLinker(df, settings)

# Profile value distributions of a few key columns (top 10 / bottom 5 values);
# the third entry profiles the birth year extracted from dob.
columns_to_profile = ["first_name", "postcode_fake", "substr(dob, 1,4)"]
linker.profile_columns(columns_to_profile, top_n=10, bottom_n=5)

# Chart the cumulative number of pairwise comparisons produced by the rules.
linker.cumulative_num_comparisons_from_blocking_rules_chart()
import splink.duckdb.duckdb_comparison_library as cl

# Full model specification: the same blocking rules as the exploratory pass,
# plus per-column comparisons, all with term-frequency adjustments enabled.
settings = {
    "link_type": "dedupe_only",
    "blocking_rules_to_generate_predictions": [
        "l.first_name = r.first_name and l.surname = r.surname",
        "l.surname = r.surname and l.dob = r.dob",
        "l.first_name = r.first_name and l.dob = r.dob",
        "l.postcode_fake = r.postcode_fake and l.first_name = r.first_name",
    ],
    "comparisons": [
        # Fuzzy name comparisons with two similarity bands (0.9 and 0.7).
        cl.jaro_winkler_at_thresholds(
            "first_name", [0.9, 0.7], term_frequency_adjustments=True
        ),
        cl.jaro_winkler_at_thresholds(
            "surname", [0.9, 0.7], term_frequency_adjustments=True
        ),
        # Edit-distance comparisons for dates of birth and postcodes.
        cl.levenshtein_at_thresholds("dob", [1, 2], term_frequency_adjustments=True),
        cl.levenshtein_at_thresholds(
            "postcode_fake", 2, term_frequency_adjustments=True
        ),
        # Exact-match comparisons for categorical columns.
        cl.exact_match("birth_place", term_frequency_adjustments=True),
        cl.exact_match("occupation", term_frequency_adjustments=True),
    ],
    # Retain comparison and intermediate calculation columns so that
    # waterfall charts can later be built from the predictions.
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
    # Expectation-maximisation stopping criteria.
    "max_iterations": 10,
    "em_convergence": 0.01,
}

# Fresh linker on an ephemeral in-memory DuckDB connection for training.
linker = DuckDBLinker(df, settings, connection=":temporary:")
# Estimate the probability that two random records match, using
# deterministic matching rules assumed to capture ~60% of true matches.
deterministic_rules = [
    "l.first_name = r.first_name and l.surname = r.surname and l.dob = r.dob",
    "substr(l.first_name,1,2) = substr(r.first_name,1,2) and l.surname = r.surname and substr(l.postcode_fake,1,2) = substr(r.postcode_fake,1,2)",
    "l.dob = r.dob and l.postcode_fake = r.postcode_fake",
]
linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.6)

# Estimate the u probabilities from a large random sample of record pairs.
linker.estimate_u_using_random_sampling(target_rows=5e6)
# First EM training session: candidate pairs are generated by blocking on
# first name and surname equality.
blocking_rule = "l.first_name = r.first_name and l.surname = r.surname"
training_session_names = (
    linker.estimate_parameters_using_expectation_maximisation(blocking_rule)
)

# Second EM training session: block on date of birth instead, so the name
# comparisons are also covered by a training run.
blocking_rule = "l.dob = r.dob"
training_session_dob = (
    linker.estimate_parameters_using_expectation_maximisation(blocking_rule)
)
The final match weights can be viewed in the match weights chart:
# Visualise the trained model's match weights, one row per comparison level.
linker.match_weights_chart()
# Chart records that carry too little information to be confidently linked.
linker.unlinkables_chart()
# Score every record pair generated by the blocking rules.
df_predict = linker.predict()
# Pull a small sample of scored pairs into pandas for inspection.
df_e = df_predict.as_pandas_dataframe(limit=5)
df_e
You can also view rows in this dataset as a waterfall chart as follows:
# NOTE(review): this import appears unused — the code below calls the
# linker.waterfall_chart method, not this standalone function. Kept in case
# a later (unseen) cell uses it; consider removing if not.
from splink.charts import waterfall_chart
# Convert the sampled predictions to a list of dicts for the waterfall chart.
records_to_plot = df_e.to_dict(orient="records")
linker.waterfall_chart(records_to_plot, filter_nulls=False)
# Group pairwise predictions into clusters at a 0.95 match-probability threshold.
clusters = linker.cluster_pairwise_predictions_at_threshold(df_predict, threshold_match_probability=0.95)
# Write an interactive cluster-inspection dashboard to an HTML file,
# sampling clusters proportionally to their size.
linker.cluster_studio_dashboard(df_predict, clusters, "50k_cluster.html", sampling_method='by_cluster_size', overwrite=True)
# Embed the generated dashboard inline in the notebook.
from IPython.display import IFrame
IFrame(
    src="./50k_cluster.html", width="100%", height=1200
)
# Evaluate against ground truth: the "cluster" column identifies the true
# underlying person, so it can serve as a labels column.
linker.roc_chart_from_labels_column("cluster",match_weight_round_to_nearest=0.02)

# Inspect false positives: pairs scored above a very high threshold that the
# labels say are not the same person.
false_positive_records = linker.prediction_errors_from_labels_column(
    "cluster",
    threshold=0.999,
    include_false_negatives=False,
    include_false_positives=True,
).as_record_dict()
linker.waterfall_chart(false_positive_records)

# Inspect false negatives: truly matching pairs scored below the threshold.
# Some of the false negatives will be because they weren't detected by the blocking rules
false_negative_records = linker.prediction_errors_from_labels_column(
    "cluster",
    threshold=0.5,
    include_false_negatives=True,
    include_false_positives=False,
).as_record_dict(limit=50)
linker.waterfall_chart(false_negative_records)