Quick and dirty persons model
Historical people: Quick and dirty
This example shows how to get some initial record linkage results as quickly as possible.
There are many ways to improve the accuracy of this model. But this may be a good place to start if you just want to give Splink a try and see what it's capable of.
import pandas as pd
# Load the input records from a local parquet file into a pandas DataFrame.
df = pd.read_parquet("./data/historical_figures_with_errors_50k.parquet")
# Preview the first five rows. As a bare expression this only produces
# visible output in an interactive/notebook session.
df.head(5)
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl
# SQL predicates over a left (l) / right (r) record pair. They are passed to
# Splink as the rules that generate candidate pairs for prediction.
_prediction_blocking_rules = [
    "l.full_name = r.full_name",
    "substr(l.full_name,1,6) = substr(r.full_name,1,6) and l.dob = r.dob and l.birth_place = r.birth_place",
    "l.dob = r.dob and l.birth_place = r.birth_place",
    "l.postcode_fake = r.postcode_fake",
]

# One comparison per column: Levenshtein-distance levels for the free-text
# columns, exact match for the categorical ones. Term frequency adjustments
# are enabled on every column except postcode_fake.
_column_comparisons = [
    cl.levenshtein_at_thresholds(
        "full_name", [1, 3, 5], term_frequency_adjustments=True
    ),
    cl.levenshtein_at_thresholds(
        "dob", [1, 2], term_frequency_adjustments=True
    ),
    cl.levenshtein_at_thresholds("postcode_fake", 2),
    cl.exact_match("birth_place", term_frequency_adjustments=True),
    cl.exact_match("occupation", term_frequency_adjustments=True),
]

# Model settings handed to the linker: a dedupe-only link type plus the
# blocking rules and comparisons defined above.
settings = dict(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=_prediction_blocking_rules,
    comparisons=_column_comparisons,
)
# Build a DuckDB-backed linker over the loaded DataFrame with the settings
# dictionary; the linker's basic logging set-up is switched off.
linker = DuckDBLinker(df, settings, set_up_basic_logging=False)
# SQL predicates over a left (l) / right (r) record pair, used below to
# estimate the probability that two random records match.
deterministic_rules = [
"l.full_name = r.full_name",
"l.postcode_fake = r.postcode_fake and l.dob = r.dob",
]
# NOTE(review): recall=0.6 is presumably the assumed share of true matches
# that the deterministic rules capture -- confirm against the Splink docs.
linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.6)
# Estimate the u parameters by random sampling, with target_rows=2e6.
linker.estimate_u_using_random_sampling(target_rows=2e6)
# NOTE(review): no m-probability training step appears in this section, so
# predictions presumably rely on the comparison library's defaults -- confirm.
# Score candidate pairs, keeping those with match probability >= 0.9
# (presumably inclusive -- verify).
results = linker.predict(threshold_match_probability=0.9)
# Preview up to five predicted pairs as a pandas DataFrame. As a bare
# expression this only produces visible output in a notebook session.
results.as_pandas_dataframe(limit=5)