Skip to content

Comparison values

Run the following cell to get an interactive interface for trying out comparisons from the comparison_library.

The interface uses the default arguments for each comparison, and can only be used for comparisons that refer to a single column

In the following cells, you can get similar results for custom comparisons with an arbitrary number of input columns.

```python import splink.comparison_library as cl from splink.internals.testing import comparison_vector_value, is_in_level from splink import DuckDBAPI import ipywidgets as widgets from IPython.display import Markdown

def report_comparison_levels(comparison, values_dict, column_name): db_api = DuckDBAPI() levels = comparison.create_comparison_levels() comparison_dict = comparison.get_comparison("duckdb").as_dict()

table_rows = []
table_rows.append("| Match | Level | Description | SQL Condition |")
table_rows.append("|-------|-------|-------------|---------------|")

total_levels = len(levels)
matched_level = None
for i, level in enumerate(levels):
    level_number = total_levels - i - 1
    label = level.create_label_for_charts()
    sql_condition = comparison_dict['comparison_levels'][i]['sql_condition']

    is_match = is_in_level(level, values_dict, db_api)

    if is_match and matched_level is None:
        matched_level = level_number
        match_indicator = "✓"
        row = f"| {match_indicator} | **{level_number}** | **{label}** | `{sql_condition}` |"
    else:
        match_indicator = ""
        row = f"| {match_indicator} | {level_number} | {label} | `{sql_condition}` |"

    table_rows.append(row)

markdown_output = "\n".join(table_rows)
return markdown_output

def create_comparison_playground(column_name):

comparison_types = [
    'ExactMatch', 'LevenshteinAtThresholds', 'JaroAtThresholds',
    'JaroWinklerAtThresholds', 'DamerauLevenshteinAtThresholds',
    'JaccardAtThresholds',
    'AbsoluteDateDifferenceAtThresholds',
    'ArrayIntersectAtSizes', 'DateOfBirthComparison',
     'EmailComparison',
    'NameComparison', 'PostcodeComparison'
]

default_values = {
    'ExactMatch': ('john', 'jon'),
    'LevenshteinAtThresholds': ('smith', 'smyth'),
    'JaroAtThresholds': ('martha', 'matha'),
    'JaroWinklerAtThresholds': ('williams', 'willaims'),
    'DamerauLevenshteinAtThresholds': ('receive', 'recieve'),
    'CosineSimilarityAtThresholds': ('data science', 'science data'),
    'JaccardAtThresholds': ('0123456789', '012345678'),
    'AbsoluteDateDifferenceAtThresholds': ('2023-01-01', '2023-01-15'),
    'ArrayIntersectAtSizes': ('apple,banana,cherry', 'banana,cherry,date'),
    'DateOfBirthComparison': ('1990-05-15', '1990-05-16'),
    'EmailComparison': ('john.doe@example.com', 'john.doe@other.com'),
    'NameComparison': ('Elizabeth', 'Elisabeth'),
    'PostcodeComparison': ('SW1A 1AA', 'SW1A 1AB')
}

db_api = DuckDBAPI()

docstrings = {}
for comp_type in comparison_types:
    class_obj = getattr(cl, comp_type)
    init_doc = getattr(class_obj.__init__, '__doc__', None)
    docstrings[comp_type] = init_doc if init_doc else class_obj.__doc__


def get_comparison(comp_type):
    if comp_type in ['DateOfBirthComparison']:

        return getattr(cl, comp_type)(column_name, input_is_string=True)
    if comp_type == 'AbsoluteDateDifferenceAtThresholds':
        return getattr(cl, comp_type)(column_name, input_is_string=True, metrics=["day", "month"], thresholds=[1, 1])
    elif comp_type in ['EmailComparison', 'ForenameSurnameComparison', 'NameComparison', 'PostcodeComparison', 'ArrayIntersectAtSizes']:
        return getattr(cl, comp_type)(column_name)
    else:
        return getattr(cl, comp_type)(column_name)

def run_comparison(change):
    left_value = left_input.value if left_input.value != "" else None
    right_value = right_input.value if right_input.value != "" else None
    comparison = get_comparison(comparison_select.value)

    if comparison_select.value == 'ArrayIntersectAtSizes':
        left_value = left_value.split(',') if left_value else None
        right_value = right_value.split(',') if right_value else None

    values_dict = {f"{column_name}_l": left_value, f"{column_name}_r": right_value}

    output.clear_output()
    markdown_output = report_comparison_levels(comparison, values_dict, column_name)
    with output:
        display(Markdown("### Comparison levels:"))
        display(Markdown(markdown_output))

        docstring = docstrings.get(comparison_select.value, "No docstring available")
        processed_docstring = "\n".join(line.strip() for line in docstring.split("\n"))
        display(Markdown("### Comparison Function Docstring:"))
        display(Markdown(processed_docstring))


    # Store the markdown output for later use
    playground.markdown_output = markdown_output



def on_comparison_change(change):
    new_value = change['new']
    left_value, right_value = default_values.get(new_value, ('', ''))

    # Temporarily unobserve the input widgets
    left_input.unobserve(run_comparison, names='value')
    right_input.unobserve(run_comparison, names='value')

    # Update the values
    left_input.value = left_value
    right_input.value = right_value

    # Re-observe the input widgets
    left_input.observe(run_comparison, names='value')
    right_input.observe(run_comparison, names='value')

    # Run the comparison once after updating both inputs
    run_comparison(None)


comparison_select = widgets.Dropdown(
    options=comparison_types,
    value='ExactMatch',
    description='Comparison:',
)
left_input = widgets.Text(description=f"{column_name} Left:", value=default_values['ExactMatch'][0])
right_input = widgets.Text(description=f"{column_name} Right:", value=default_values['ExactMatch'][1])
output = widgets.Output()

comparison_select.observe(on_comparison_change, names='value')
for widget in (comparison_select, left_input, right_input):
    widget.observe(run_comparison, names='value')

# Call run_comparison immediately to compute initial output
playground = widgets.VBox([comparison_select, left_input, right_input, output])

run_comparison(None)

return playground

playground = create_comparison_playground("column") display(playground) ```

VBox(children=(Dropdown(description='Comparison:', options=('ExactMatch', 'LevenshteinAtThresholds', 'JaroAtTh…

```python import splink.comparison_level_library as cll

first_name_comparison = cl.CustomComparison( comparison_levels=[ cll.NullLevel("first_name"), cll.ExactMatchLevel("first_name"), { "sql_condition": "first_name_l = surname_r", "label_for_charts": "Match on reversed cols: first_name and surname", }, cll.JaroWinklerLevel("first_name", 0.8), cll.ElseLevel(), ] )

Need to be able to pass values in as a dict

values = { "first_name_l": "Robin", "first_name_r": "Linacre", "surname_l": "Linacre", "surname_r": "Robin", } display(Markdown(report_comparison_levels(first_name_comparison, values, "first_name"))) ```

Match Level Description SQL Condition
4 first_name is NULL "first_name_l" IS NULL OR "first_name_r" IS NULL
3 Exact match on first_name "first_name_l" = "first_name_r"
2 Match on reversed cols: first_name and surname first_name_l = surname_r
1 Jaro-Winkler distance of first_name >= 0.8 jaro_winkler_similarity("first_name_l", "first_name_r") >= 0.8
0 All other comparisons ELSE