df:
col1
['aa', 'bb', 'cc', 'dd']
['this', 'is', 'a', 'list', '2']
['this', 'list', '3']col2
[['ee', 'ff', 'gg', 'hh'], ['qq', 'ww', 'ee', 'rr']]
[['list', 'a', 'not', '1'], ['not', 'is', 'this', '2']]
[['this', 'is', 'list', 'not'], ['a', 'not', 'list', '2']]
What I'm trying to do:
I am trying to run the code below on each element (word) in df col1
on each corresponding element in each of the sublists in col2
, and put the scores in a new column.
So for the first row in col1
, run the get_top_matches
function on this:
`col1` "aa" and `col2` "ee" and "qq"
`col1` "bb" and `col2` "ff" and "ww"
`col1` "cc" and `col2` "gg" and "ee"
`col1` "dd" and `col2` "hh" and "rr"
What the new column should look like:
I don't know for sure what row 2 and 3 scores should be
score_col
[1.0, 1.0, 1.0, 1.0]
[.34, .33, .27, .24, .23] #not sure
[.23, .13, .26] #not sure
What I've tried before:
I've done when col1
was just a string against each list element in col2
, like this, but i don't have the slightest idea how to run it against list elements to corresponding sublist elements:
df.agg(lambda x: get_top_matches(*x), axis=1)
. . . .
The Function Code
Here's the get_top_matches
function - just run this whole thing; i'm only calling the last function for this question:
#jaro version
def sort_token_alphabetically(word):token = re.split('[,. ]', word)sorted_token = sorted(token)return ' '.join(sorted_token)def get_jaro_distance(first, second, winkler=True, winkler_ajustment=True,scaling=0.1, sort_tokens=True):""":param first: word to calculate distance for:param second: word to calculate distance with:param winkler: same as winkler_ajustment:param winkler_ajustment: add an adjustment factor to the Jaro of the distance:param scaling: scaling factor for the Winkler adjustment:return: Jaro distance adjusted (or not)"""if sort_tokens:first = sort_token_alphabetically(first)second = sort_token_alphabetically(second)if not first or not second:raise JaroDistanceException("Cannot calculate distance from NoneType ({0}, {1})".format(first.__class__.__name__,second.__class__.__name__))jaro = _score(first, second)cl = min(len(_get_prefix(first, second)), 4)if all([winkler, winkler_ajustment]): # 0.1 as scaling factorreturn round((jaro + (scaling * cl * (1.0 - jaro))) * 100.0) / 100.0return jarodef _score(first, second):shorter, longer = first.lower(), second.lower()if len(first) > len(second):longer, shorter = shorter, longerm1 = _get_matching_characters(shorter, longer)m2 = _get_matching_characters(longer, shorter)if len(m1) == 0 or len(m2) == 0:return 0.0return (float(len(m1)) / len(shorter) +float(len(m2)) / len(longer) +float(len(m1) - _transpositions(m1, m2)) / len(m1)) / 3.0def _get_diff_index(first, second):if first == second:passif not first or not second:return 0max_len = min(len(first), len(second))for i in range(0, max_len):if not first[i] == second[i]:return ireturn max_lendef _get_prefix(first, second):if not first or not second:return ""index = _get_diff_index(first, second)if index == -1:return firstelif index == 0:return ""else:return first[0:index]def _get_matching_characters(first, second):common = []limit = math.floor(min(len(first), len(second)) / 2)for i, l in enumerate(first):left, right = int(max(0, i - limit)), int(min(i + limit + 1, len(second)))if l in second[left:right]:common.append(l)second = second[0:second.index(l)] + '*' + second[second.index(l) + 1:]return ''.join(common)def _transpositions(first, second):return math.floor(len([(f, s) for f, s in zip(first, second) if not f == s]) / 2.0)def get_top_matches(reference, value_list, max_results=None):scores = []if not max_results:max_results = len(value_list)for val in value_list:score_sorted = get_jaro_distance(reference, val)score_unsorted = get_jaro_distance(reference, val, sort_tokens=False)scores.append((val, max(score_sorted, score_unsorted)))scores.sort(key=lambda x: x[1], reverse=True)return scores[:max_results]class JaroDistanceException(Exception):def __init__(self, message):super(Exception, self).__init__(message)
. . .
Attempt 1 Just trying to get this to compare to each word in the lists rather than each letter:
[[[df1.agg(lambda x: get_top_matches(u,w), axis=1) for u,w in zip(x,v)]\ for v in y] for x,y in zip(df1['parent_org_name_list'], df1['children_org_name_sublists'])]
Attempt 2
Changing the get_top_matches
function to say for val in value_list.split():
resulted in this below - which grabs the first word and compares it to the first word in each sublist in col2
5 times (not sure why 5 times):
[[0 [(myalyk, 0.73)]1 [(myalyk, 0.73)]2 [(myalyk, 0.73)]3 [(myalyk, 0.73)]4 [(myalyk, 0.73)]dtype: object]
, [0 [(myliu, 0.79)]1 [(myliu, 0.79)]2 [(myliu, 0.79)]3 [(myliu, 0.79)]4 [(myliu, 0.79)]dtype: object]
, [0 [(myllc, 0.97)]1 [(myllc, 0.97)]2 [(myllc, 0.97)]3 [(myllc, 0.97)]4 [(myllc, 0.97)]dtype: object]
, [0 [(myloc, 0.88)]1 [(myloc, 0.88)]2 [(myloc, 0.88)]3 [(myloc, 0.88)]4 [(myloc, 0.88)]dtype: object]
]
Just need the function to run on each word in the sublists.
Attempt 3
Removing the second attempt code from the get_top_matches
function and modifying the attempt one list comprehension code to below, grabbed the first word in the first 3 sublists in col2
; need to compare against the col1
list to each word in the col2
sublists:
[[df.agg(lambda x: get_top_matches(u,v), axis=1) for u in x ]for v in zip(*y)]for x,y in zip(df['col1'], df['col2'])
]
results to attempt 3
[[0 [(myllc, 0.97), (myloc, 0.88), (myliu, 0.79),
...1 [(myllc, 0.97), (myloc, 0.88), (myliu, 0.79),
...2 [(myllc, 0.97), (myloc, 0.88), (myliu, 0.79),
...3 [(myllc, 0.97), (myloc, 0.88), (myliu, 0.79),
...4 [(myllc, 0.97), (myloc, 0.88), (myliu, 0.79),
...dtype: object]]
Expectation (this example: row 1 has 4 sublists, row 2 has 2 sublists. the function runs on each word in each column 1 for each word in each sublist in column 2 and puts the results in a sublist in a new column.)
[[['myalyk',.97], ['oleksandr',.54], ['nychyporovych',.3], ['pp',0]], [['myliu',.88], ['srl',.43]], [['myllc',1.0]], [['myloc',1.0], ['manag',.45], ['IT',.1], ['ag',0]]],
[[['ltd',.34], ['yuriapharm',.76]], [['yuriypra',.65], ['law',.54], ['offic',.45], ['pc',.34]]],
...