Nov 16, 2019

Python fuzzywuzzy

#Ref: https://www.datacamp.com/community/tutorials/fuzzy-string-python

#pip install fuzzywuzzy
#pip install python-Levenshtein

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Demo 1: score two near-identical company names with fuzz.ratio.
print('-------------string matching')
name_a, name_b = "Apple Inc.", "apple Inc"
# Lower-case both inputs up front so the comparison is case-insensitive.
name_a_lc = name_a.lower()
name_b_lc = name_b.lower()
similarity = fuzz.ratio(name_a_lc, name_b_lc)
print(similarity)       # recorded output: 95

# Demo 2: one string is contained in the other; compare fuzz.ratio with
# fuzz.partial_ratio on the same pair.
print('-------------substring matching')
full_name, short_name = "Los Angeles Lakers", "Lakers"
# Lower-case once and reuse the result for both scorers.
full_lc, short_lc = full_name.lower(), short_name.lower()
whole_score = fuzz.ratio(full_lc, short_lc)
partial_score = fuzz.partial_ratio(full_lc, short_lc)
print(whole_score)      # recorded output: 50
print(partial_score)    # recorded output: 100

# Demo 3: same words in a different order (strings of equal length).
print('-------------string different order match - same length')
case_a, case_b = "united states v. nixon", "Nixon v. United States"
# ratio / partial_ratio are given explicitly lower-cased copies.
case_a_lc, case_b_lc = case_a.lower(), case_b.lower()
whole_score = fuzz.ratio(case_a_lc, case_b_lc)
partial_score = fuzz.partial_ratio(case_a_lc, case_b_lc)
# The token_* scorers receive the raw strings: per the original note, they
# tokenize the input and preprocess it themselves (lower-casing it and
# removing punctuation).
sorted_score = fuzz.token_sort_ratio(case_a, case_b)
print(whole_score)      # recorded output: 59
print(partial_score)    # recorded output: 74
print(sorted_score)     # recorded output: 100

# Demo 4: shared words in a different order, with strings of different
# length; all four scorers are run on the same pair for comparison.
print('-------------string different order match - different length')
long_title = "The supreme court case of Nixon vs The United States"
short_title = "Nixon v. United States"
# ratio / partial_ratio are given explicitly lower-cased copies ...
long_lc, short_lc = long_title.lower(), short_title.lower()
whole_score = fuzz.ratio(long_lc, short_lc)
partial_score = fuzz.partial_ratio(long_lc, short_lc)
# ... while the token_* scorers are handed the raw strings.
sorted_score = fuzz.token_sort_ratio(long_title, short_title)
set_score = fuzz.token_set_ratio(long_title, short_title)
print(whole_score)      # recorded output: 57
print(partial_score)    # recorded output: 77
print(sorted_score)     # recorded output: 58
print(set_score)        # recorded output: 95

# Demo 5: match one query string against a list of candidates.
print('-------------search string in a list of strings with score/ratio')
query = "apple inc"
candidates = [
    "Apple Inc.",
    "apple park",
    "apple incorporated",
    "iphone",
]
# process.extract yields (candidate, score) pairs for the query.
scored_matches = process.extract(query, candidates)
print(scored_matches)
# recorded output:
# [('Apple Inc.', 100), ('apple incorporated', 90), ('apple park', 67), ('iphone', 40)]
# process.extractOne gives only the single best-scoring candidate.
best_match = process.extractOne(query, candidates)
print(best_match)
# recorded output: ('Apple Inc.', 100)


No comments:

Post a Comment