Here's my C# SQL CLR implementation- just deploy to an SQL server.
In my opinion, for anything involving significant string manipulation, a CLR language is vastly superior to SQL server almost every time (as well as being a lot more elegant).
The JaroWinkler algorithm was ported from the java lingpipe version, so I'd advise checking the output before committing to production.
Also, it's very fast, I was getting 36k rows processing and sorting on JaroWinkler scores in under a second.
JaroWinkler.cs
using System;
using System.Collections.Generic;
using System.Text;
namespace JaroWinkler
{
class JaroWinklerDistance
{
private double mWeightThreshold;
private double mNumChars;
/**
* Construct a basic Jaro string distance without the Winkler
* modifications. See the class documentation above for more information
* on the exact algorithm and its parameters.
*/
public JaroWinklerDistance() {
mNumChars=Double.PositiveInfinity;
mWeightThreshold=0;
}
/**
* Construct a Winkler-modified Jaro string distance with the
* specified weight threshold for refinement and an initial number
* of characters over which to reweight. See the class
* documentation above for more information on the exact algorithm
* and its parameters.
*/
public JaroWinklerDistance(double weightThreshold, int numChars) {
mNumChars = numChars;
mWeightThreshold = weightThreshold;
}
/**
* Returns the Jaro-Winkler distance between the specified character
* sequences. Teh distance is symmetric and will fall in the
* range <code>0</code> (perfect match) to <code>1</code> (no overlap).
* See the class definition above for formal definitions.
*
*
This method is defined to be:
*
* <pre>
* distance(cSeq1,cSeq2) = 1 - proximity(cSeq1,cSeq2)</code></pre>
*
* @param cSeq1 First character sequence to compare.
* @param cSeq2 Second character sequence to compare.
* @return The Jaro-Winkler comparison value for the two character
* sequences.
*/
public double distance(String cSeq1, String cSeq2)
{
return 1.0 - proximity(cSeq1,cSeq2);
}
/**
* Return the Jaro-Winkler comparison value between the specified
* character sequences. The comparison is symmetric and will fall
* in the range <code>0</code> (no match) to <code>1</code>
* (perfect match)inclusive. See the class definition above for
* an exact definition of Jaro-Winkler string comparison.
*
*
The method {@link #distance(CharSequence,CharSequence)} returns
* a distance measure that is one minus the comparison value.
*
* @param cSeq1 First character sequence to compare.
* @param cSeq2 Second character sequence to compare.
* @return The Jaro-Winkler comparison value for the two character
* sequences.
*/
public double proximity(String cSeq1, String cSeq2) {
int len1 = cSeq1.Length;
int len2 = cSeq2.Length;
if (len1 == 0)
return len2 == 0 ? 1.0 : 0.0;
int searchRange = Math.Max(0,Math.Max(len1,len2)/2 - 1);
bool[] matched1 = new bool[len1];
matched1.Initialize();
//Arrays.Fill(matched1,false);
bool[] matched2 = new bool[len2];
matched2.Initialize();
//Arrays.fill(matched2,false);
int numCommon = 0;
for (int i = 0; i < len1; ++i) {
int start = Math.Max(0,i-searchRange);
int end = Math.Min(i+searchRange+1,len2);
for (int j = start; j < end; ++j) {
if (matched2[j]) continue;
if (cSeq1 != cSeq2[j])
continue;
matched1 = true;
matched2[j] = true;
++numCommon;
break;
}
}
if (numCommon == 0) return 0.0;
int numHalfTransposed = 0;
int k = 0;
for (int i = 0; i < len1; ++i) {
if (!matched1) continue;
while (!matched2[k]) { ++k; }
if (cSeq1 != cSeq2[k])
{
++numHalfTransposed;
}
++k;
}
// System.out.println("numHalfTransposed=" + numHalfTransposed);
int numTransposed = numHalfTransposed/2;
// System.out.println("numCommon=" + numCommon
// + " numTransposed=" + numTransposed);
double numCommonD = numCommon;
double weight = (numCommonD/len1
+ numCommonD/len2
+ (numCommon - numTransposed)/numCommonD)/3.0;
if (weight <= mWeightThreshold) return weight;
double max = Math.Min(mNumChars,Math.Min(cSeq1.Length,cSeq2.Length));
int pos = 0;
while (pos < max && cSeq1[pos] == cSeq2[pos])
{
++pos;
}
if (pos == 0) return weight;
return weight + 0.1 * pos * (1.0 - weight);
}
/**
* A constant for the Jaro distance. The value is the same as
* would be returned by the nullary constructor
* <code>JaroWinklerDistance()</code>.
*
*
Instances are thread safe, so this single distance instance
* may be used for all comparisons within an application.
*/
//public static JaroWinklerDistance JARO_DISTANCE = new JaroWinklerDistance();
/**
* A constant for the Jaro-Winkler distance with defaults set as
* in Winkler's papers. The value is the same as would be
* returned by the nullary constructor
* <code>JaroWinklerDistance(0.7,4)</code>.
*
*
Instances are thread safe, so this single distance instance
* may be used for all comparisons within an application.
*/
//public static JaroWinklerDistance JARO_WINKLER_DISTANCE = new JaroWinklerDistance(0.70,4);
}
}
JaroWinklerWrapper.cs
using System;
using System.Data;
using System.Data.SqlClient;
using System.Data.SqlTypes;
using Microsoft.SqlServer.Server;
using JaroWinkler;
public partial class UserDefinedFunctions
{
[Microsoft.SqlServer.Server.SqlFunction]
public static SqlDouble JaroWinklerDistance(string s, string t)
{
JaroWinklerDistance a = new JaroWinklerDistance();
return a.distance(s, t);
}
[Microsoft.SqlServer.Server.SqlFunction]
public static SqlDouble JaroWinklerProximity(string s, string t)
{
JaroWinklerDistance a = new JaroWinklerDistance();
return a.proximity(s, t);
}
};
Then call it from your code something like this
select
Name,
'TestString',
dbo.JaroWinklerDistance('TestString',Name)
from
Names
WHERE
Name is not null