Quick and dirty persons model

Historical people: Quick and dirty

This example shows how to get some initial record linkage results as quickly as possible.

There are many ways to improve the accuracy of this model. But this may be a good place to start if you just want to give Splink a try and see what it's capable of.

from splink.datasets import splink_datasets

# Load the `historical_50k` example dataset exposed by `splink_datasets`.
df = splink_datasets.historical_50k
# Preview the first five rows to see which columns are available for linkage.
df.head(5)

	unique_id	cluster	full_name	first_and_surname	first_name	surname	dob	birth_place	postcode_fake	gender	occupation
0	Q2296770-1	Q2296770	thomas clifford, 1st baron clifford of chudleigh	thomas chudleigh	thomas	chudleigh	1630-08-01	devon	tq13 8df	male	politician
1	Q2296770-2	Q2296770	thomas of chudleigh	thomas chudleigh	thomas	chudleigh	1630-08-01	devon	tq13 8df	male	politician
2	Q2296770-3	Q2296770	tom 1st baron clifford of chudleigh	tom chudleigh	tom	chudleigh	1630-08-01	devon	tq13 8df	male	politician
3	Q2296770-4	Q2296770	thomas 1st chudleigh	thomas chudleigh	thomas	chudleigh	1630-08-01	devon	tq13 8hu	None	politician
4	Q2296770-5	Q2296770	thomas clifford, 1st baron chudleigh	thomas chudleigh	thomas	chudleigh	1630-08-01	devon	tq13 8df	None	politician

from splink import block_on, SettingsCreator
import splink.comparison_library as cl


# Blocking rules passed as `blocking_rules_to_generate_predictions`.
prediction_blocking_rules = [
    block_on("full_name"),
    block_on("substr(full_name,1,6)", "dob", "birth_place"),
    block_on("dob", "birth_place"),
    block_on("postcode_fake"),
]

# One comparison per piece of evidence, built in the order they are passed on.
name_comparison = cl.ForenameSurnameComparison(
    "first_name",
    "surname",
    forename_surname_concat_col_name="first_and_surname",
)
# `dob` is supplied as a string column, hence `input_is_string=True`.
dob_comparison = cl.DateOfBirthComparison("dob", input_is_string=True)
postcode_comparison = cl.LevenshteinAtThresholds("postcode_fake", 2)
# Term-frequency adjustments are switched on for the two columns below.
birth_place_comparison = cl.JaroWinklerAtThresholds("birth_place", 0.9).configure(
    term_frequency_adjustments=True
)
occupation_comparison = cl.ExactMatch("occupation").configure(
    term_frequency_adjustments=True
)

# Deduplication of a single input table, assembled from the pieces above.
settings = SettingsCreator(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=prediction_blocking_rules,
    comparisons=[
        name_comparison,
        dob_comparison,
        postcode_comparison,
        birth_place_comparison,
        occupation_comparison,
    ],
)

from splink import Linker, DuckDBAPI


# Build the linker over `df` with the settings defined earlier, using DuckDB
# as the backend; Splink's basic logging set-up is turned off.
linker = Linker(df, settings, db_api=DuckDBAPI(), set_up_basic_logging=False)
# SQL conditions over the left (`l`) and right (`r`) record of a pair; they
# are passed to the prior-estimation call below.
deterministic_rules = [
    "l.full_name = r.full_name",
    "l.postcode_fake = r.postcode_fake and l.dob = r.dob",
]

# Estimate the probability that two random records match from the
# deterministic rules.
# NOTE(review): recall=0.6 presumably encodes a guess that these rules find
# about 60% of all true matches -- confirm against the Splink documentation.
linker.training.estimate_probability_two_random_records_match(
    deterministic_rules, recall=0.6
)

# Estimate u values from randomly sampled record pairs (capped via max_pairs).
# No m-value training step is run in this example, which is why the later
# predict() call warns that m values are not fully trained.
linker.training.estimate_u_using_random_sampling(max_pairs=2e6)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

# Run prediction with a match-probability threshold of 0.9; the sample rows
# shown further down all score just above that threshold.
results = linker.inference.predict(threshold_match_probability=0.9)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))



 -- WARNING --
You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'first_name_surname':
    m values not fully trained
Comparison: 'first_name_surname':
    u values not fully trained
Comparison: 'dob':
    m values not fully trained
Comparison: 'postcode_fake':
    m values not fully trained
Comparison: 'birth_place':
    m values not fully trained
Comparison: 'occupation':
    m values not fully trained

# Display five of the predicted pairs as a pandas DataFrame.
results.as_pandas_dataframe(limit=5)

	match_weight	match_probability	unique_id_l	unique_id_r	first_name_l	first_name_r	surname_l	surname_r	first_and_surname_l	first_and_surname_r	...	gamma_postcode_fake	birth_place_l	birth_place_r	gamma_birth_place	occupation_l	occupation_r	gamma_occupation	full_name_l	full_name_r	match_key
0	3.170005	0.900005	Q7412607-1	Q7412607-3	samuel	samuel	shelley	shelley	samuel shelley	samuel shelley	...	0	whitechapel	city of london	0	illuminator	illuminator	1	samuel shelley	samuel shelley	0
1	3.170695	0.900048	Q15997578-4	Q15997578-7	job	wilding	wilding	None	job wilding	wilding	...	-1	wrexham	wrexham	2	association football player	association football player	1	job wilding	wilding	2
2	3.170695	0.900048	Q15997578-2	Q15997578-7	job	wilding	wilding	None	job wilding	wilding	...	-1	wrexham	wrexham	2	association football player	association football player	1	job wilding	wilding	2
3	3.170695	0.900048	Q15997578-1	Q15997578-7	job	wilding	wilding	None	job wilding	wilding	...	-1	wrexham	wrexham	2	association football player	association football player	1	job wilding	wilding	2
4	3.172553	0.900164	Q5726641-11	Q5726641-8	henry	harry	page	paige	henry page	harry paige	...	2	staffordshire moorlands	staffordshire moorlands	2	cricketer	cricketer	1	henry page	harry paige	3

5 rows × 26 columns