DoubleMetaPhone
This document implements a "sounds like" algorithm developed by Lawrence Philips which he published in the June, 2000 issue of [C/C++ Users Journal.]
Double Metaphone is an improved version of Philips' original Metaphone algorithm.
The UnrealScript implementation has been adapted from Stephen Woodbridge's [PHP implementation] by El Muerte.
Note: Because of the size of the algorithm there might be errors in it.
The algorithm
/** Returns the double metaphone value of in. The array always contains two values. */ static final function array<string> DoubleMetaPhone(coerce string in, optional int mlen) { local array<string> res; local string prim, sec, tmp; local int cur, last, slen; if (mlen == 0) mlen = 4; slen = Len(in); last = slen-1; in = caps(in$" "); // skip this at beginning of word switch (Left(in, 2)) { case "GN": case "KN": case "PN": case "WR": case "PS": cur++; } // Initial 'X' is pronounced 'Z' e.g. 'Xavier' if (Left(in, 1) == "X") { prim $= "S"; // 'Z' maps to 'S' sec $= "S"; cur++; } // main loop while (len(prim) < mlen || len(sec) < mlen) { if (cur >= slen) break; switch (mid(in, cur, 1)) { case "A": case "E": case "I": case "O": case "U": case "Y": if (cur == 0) // all init vowels now map to 'A' { prim $= "A"; sec $= "A"; } cur++; break; case "B": // '-mb', e.g. "dumb", already skipped over ... prim $= "P"; sec $= "P"; if (mid(in, cur+1, 1) == "B") cur += 2; else cur++; break; case "Ç": prim $= "S"; sec $= "S"; cur++; break; case "C": // various gremanic if ((cur > 1) && (!is_vowel(in, cur-2)) && (mid(in, cur - 1, 3) == "ACH") && ((mid(in, cur + 2, 1) != "I") && ((mid(in, cur + 2, 1) != "E") || smid(in, cur - 2, 6, tmp) == "BACHER") || tmp == "MACHER")) { prim $= "K"; sec $= "K"; cur += 2; break; } // special case 'caesar' if ((cur == 0) && (mid(in, cur, 6) == "CAESAR")) { prim $= "S"; sec $= "S"; cur += 2; break; } // italian 'chianti' if (mid(in, cur, 4) == "CHIA") { prim $= "K"; sec $= "K"; cur += 2; break; } if (mid(in, cur, 2) == "CH") { // find 'michael' if ((cur > 0) && (mid(in, cur, 4) == "CHAE")) { prim $= "K"; sec $= "X"; cur += 2; break; } // greek roots e.g. 'chemistry', 'chorus' if ((cur == 0) && (smid(in, cur+1, 5, tmp) == "HARAC" || tmp == "HARIS") || (smid(in, cur+1, 3, tmp) == "HOR" || tmp == "HYM" || tmp == "HIA" || tmp == "HEM") && (mid(in, 0, 5) != "CHORE")) { prim $= "K"; sec $= "K"; cur += 2; break; } // germanic, greek, or otherwise 'ch' for 'kh' sound if (( (smid(in, 0, 4, tmp) == "VAN ") || (tmp == "VON ") || (tmp == "SCH") ) // 'architect' but not 'arch', orchestra', 'orchid' || ( (smid(in, cur-2, 6, tmp) == "ORCHES") || (tmp == "ARCHIT") || (tmp == "ORCHID") ) || (InStr("TS", Mid(in, cur+2, 1)) > -1) || ( ((InStr("AOUE", Mid(in, cur-1, 1)) > -1) || (cur == 0) ) // e.g. 'wachtler', 'weschsler', but not 'tichner' && (InStr("LRNMBHFVW", Mid(in, cur+2, 1)) > -1)) ) { prim $= "K"; sec $= "K"; } else { if (cur > 0) { if (mid(in, 0, 2) == "MC") { // e.g. 'McHugh' prim $= "K"; sec $= "K"; } else { prim $= "X"; sec $= "K"; } } else { prim $= "X"; sec $= "X"; } } cur += 2; break; } // e.g. 'czerny' if (mid(in, cur, 2) == "CZ" && mid(in, cur-2, 4) != "WICZ") { prim $= "S"; sec $= "X"; cur += 2; break; } // e.g. 'focaccia' if (mid(in, cur + 1, 3) == "CIA") { prim $= "X"; sec $= "X"; cur += 3; break; } // double 'C', but not McClellan' if (mid(in, cur, 2) == "CC" && !((cur == 1) && (mid(in, 0, 1) == "M"))) { // 'bellocchio' but not 'bacchus' if ((InStr("IEH", mid(in, cur + 2, 1)) > -1) && (mid(in, cur + 2, 2) != "HU")) { // 'accident', 'accede', 'succeed' if (((cur == 1) && (mid(in, cur - 1, 1) == "A")) || (smid(in, cur-1, 5, tmp) == "UCCEE") || (tmp == "UCCES")) { prim $= "KS"; sec $= "KS"; // 'bacci', 'bertucci', other italian } else { prim $= "X"; sec $= "X"; } cur += 3; break; } else { // Pierce's rule prim $= "K"; sec $= "K"; cur += 2; break; } } if ((smid(in, cur, 2, tmp) == "CK") || (tmp == "CG") || (tmp == "CQ")) { prim $= "K"; sec $= "K"; cur += 2; break; } if ((smid(in, cur, 2, tmp) == "CI") || (tmp == "CE") || (tmp == "CY")) { // italian vs. english if ((smid(in, cur, 3, tmp) == "CIO") || (tmp == "CIE") || (tmp == "CIA")) { prim $= "S"; sec $= "X"; } else { prim $= "S"; sec $= "S"; } cur += 2; break; } // else prim $= "K"; sec $= "K"; // name sent in 'mac caffrey', 'mac gregor' if ((smid(in, cur+1, 2, tmp) == " C") || (tmp == " Q") || (tmp == " G")) { cur += 3; } else { if ((InStr("CKQ", mid(in, cur + 1, 1)) > -1) && !((smid(in, cur+1, 2, tmp) == "CE") || (tmp == "CI")) ) { cur += 2; } else { cur++; } } break; case "D": if (mid(in, cur, 2) == "DG") { if (InStr("IEY", mid(in, cur + 2, 1)) > -1) { // e.g. 'edge' prim $= "J"; sec $= "J"; cur += 3; break; } else { // e.g. 'edgar' prim $= "TK"; sec $= "TK"; cur += 2; break; } } if ((smid(in, cur, 2, tmp) == "DT") || (tmp == "DD")) { prim $= "T"; sec $= "T"; cur += 2; break; } // else prim $= "T"; sec $= "T"; cur++; break; case "F": if (mid(in, cur + 1, 1) == "F") cur += 2; else cur++; prim $= "F"; sec $= "F"; break; // TODO: optimize case "G": if (mid(in, cur + 1, 1) == "H") { if ((cur > 0) && !is_vowel(in, cur-1)) { prim $= "K"; sec $= "K"; cur += 2; break; } if (cur < 3) { // 'ghislane', 'ghiradelli' if (cur == 0) { if (mid(in, cur + 2, 1) == "I") { prim $= "J"; sec $= "J"; } else { prim $= "K"; sec $= "K"; } cur += 2; break; } } // Parker's rule (with some further refinements) - e.g. 'hugh' if (((cur > 1) && (InStr("BHD", mid(in, cur - 2, 1)) > -1)) // e.g. 'bough' || ((cur > 2) && (InStr("BHD", mid(in, cur - 3, 1)) > -1)) // e.g. 'broughton' || ((cur > 3) && (InStr("BH", mid(in, cur - 4, 1)) > -1))) { cur += 2; break; } else { // e.g. 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough' if ((cur > 2) && (Mid(in, cur - 1, 1) == "U") && (InStr("CGLRT", mid(in, cur - 3, 1)) > -1)) { prim $= "F"; sec $= "F"; } else { if ((cur > 0) && mid(in, cur - 1, 1) != "I") { prim $= "K"; sec $= "K"; } cur += 2; break; } } } if (mid(in, cur + 1, 1) == "N") { if ((cur == 1) && is_vowel(in, 0) && !Slavo_Germanic(in)) { prim $= "KN"; sec $= "N"; } else { // not e.g. 'cagney' if ((mid(in, cur + 2, 2) != "EY") && (mid(in, cur + 1) != "Y") && !Slavo_Germanic(in)) { prim $= "N"; sec $= "KN"; } else { prim $= "KN"; sec $= "KN"; } } cur += 2; break; } // 'tagliaro' if ((mid(in, cur + 1, 2) == "LI") && !Slavo_Germanic(in)) { prim $= "KL"; sec $= "L"; cur += 2; break; } // -ges-, -gep-, -gel- at beginning if ((cur == 0) && ((mid(in, cur + 1, 1) == "Y") || ((smid(in, cur + 1, 2, tmp) == "ES") || (tmp == "EP") || (tmp == "EB") || (tmp == "EL") || (tmp == "EY") || (tmp == "IB") || (tmp == "IL") || (tmp == "IN") || (tmp == "IE") || (tmp == "EI") || (tmp == "ER")) ) ) { prim $= "K"; sec $= "J"; cur += 2; break; } // -ger-, -gy- if (((mid(in, cur + 1, 2) == "ER") || (mid(in, cur + 1, 1) == "Y")) && !(smid(in, 0, 6, tmp) == "DANGER" || tmp == "RANGER" || tmp == "MANGER" ) && (InStr("EI", mid(in, cur-1, 1)) == -1) && !(smid(in, cur-1, 3, tmp) == "RGY" || tmp == "OGY" )) { prim $= "K"; sec $= "J"; cur += 2; break; } // italian e.g. 'biaggi' if ((InStr("EIY", mid(in, cur + 1, 1)) > -1) || smid(in, cur-1, 4, tmp) == "AGGI" || tmp == "OGGI") { // obvious germanic if (((smid(in, 0, 4, tmp) == "VAN " || tmp == "VON ") || (mid(in, 0, 3) == "SCH")) || (mid(in, cur + 1, 2) == "ET")) { prim $= "K"; sec $= "K"; } else { // always soft if french ending if (mid(in, cur + 1, 4) == "IER ") { prim $= "J"; sec $= "J"; } else { prim $= "J"; sec $= "K"; } } cur += 2; break; } if (mid(in, cur +1, 1) == "G") cur += 2; else cur++; prim $= "K"; sec $= "K"; break; case "H": // only keep if first & before vowel or btw. 2 vowels if (((cur == 0) || is_vowel(in, cur - 1)) && is_vowel(in, cur + 1)) { prim $= "H"; sec $= "H"; cur += 2; } else cur++; break; case "J": // obvious spanish, 'jose', 'san jacinto' if ((mid(in, cur, 4) == "JOSE") || (mid(in, 0, 4) == "SAN ")) { if (((cur == 0) && (mid(in, cur + 4, 1) == " ")) || (mid(in, 0, 4) == "SAN ")) { prim $= "H"; sec $= "H"; } else { prim $= "J"; sec $= "H"; } cur++; break; } if ((cur == 0) && (mid(in, cur, 4) != "JOSE")) { prim $= "J"; // Yankelovich/Jankelowicz sec $= "A"; } else { // spanish pron. of .e.g. 'bajador' if (is_vowel(in, cur - 1) && !Slavo_Germanic(in) && ((mid(in, cur + 1, 1) == "A") || (mid(in, cur + 1, 1) == "O"))) { prim $= "J"; sec $= "H"; } else { if (cur == last) { prim $= "J"; sec $= ""; } else { if ((InStr("LTKSNMBZ", mid(in, cur + 1, 1)) == -1) && (InStr("SKL", mid(in, cur - 1, 1)) == -1) ) { prim $= "J"; sec $= "J"; } } } } if (mid(in, cur + 1, 1) == "J") // it could happen cur += 2; else cur++; break; case "K": if (mid(in, cur + 1, 1) == "K") cur += 2; else cur++; prim $= "K"; sec $= "K"; break; case "L": if (mid(in, cur + 1, 1) == "L") { // spanish e.g. 'cabrillo', 'gallegos' if (((cur == (slen - 3)) && (smid(in, cur-1, 4, tmp) == "ILLO" || tmp == "ILLA" || tmp == "ALLE" ) || ( ((smid(in, last-1, 2, tmp) == "AS" || tmp == "OS") || (InStr("AO", mid(in, last, 1)) > -1) ) && (mid(in, cur - 1, 4) == "ALLE") ) )) { prim $= "L"; sec $= ""; cur += 2; break; } cur += 2; } else cur++; prim $= "L"; sec $= "L"; break; case "M": if (((mid(in, cur-1, 3) == "UMB") && (((cur + 1) == last) || (mid(in, cur+2, 2) == "ER"))) // 'dumb', 'thumb' || (mid(in, cur + 1, 1) == "M")) { cur += 2; } else { cur++; } prim $= "M"; sec $= "M"; break; case "N": if (mid(in, cur + 1, 1) == "N") cur += 2; else cur++; prim $= "N"; sec $= "N"; break; case "Ñ": cur++; prim $= "N"; sec $= "N"; break; case "P": if (mid(in, cur + 1, 1) == "H") { cur += 2; prim $= "F"; sec $= "F"; break; } // also account for "campbell" and "raspberry" if (InStr("PB", Mid(in, cur + 1, 1)) > -1) cur += 2; else cur++; prim $= "P"; sec $= "P"; break; case "Q": if (mid(in, cur + 1, 1) == "Q") cur += 2; else cur++; prim $= "K"; sec $= "K"; break; case "R": // french e.g. 'rogier', but exclude 'hochmeier' if ((cur == last) && !Slavo_Germanic(in) && (mid(in, cur - 2, 2) == "IE") && !(smid(in, cur-4, 2, tmp) == "ME" || tmp == "MA" )) { prim $= ""; sec $= "R"; } else { prim $= "R"; sec $= "R"; } if (mid(in, cur + 1, 1) == "R") cur += 2; else cur++; break; case "S": // special cases 'island', 'isle', 'carlisle', 'carlysle' if ((smid(in, cur - 1, 3, tmp) == "ISL") || (tmp == "YSL")) { cur++; break; } // special case 'sugar-' if ((cur == 0) && (mid(in, cur, 5) == "SUGAR")) { prim $= "X"; sec $= "S"; cur++; break; } if (mid(in, cur, 2) == "SH") { // germanic if (smid(in, cur + 1, 4, tmp) == "HEIM" || tmp == "HOEK" || tmp == "HOLM" || tmp == "HOLZ") { prim $= "S"; sec $= "S"; } else { prim $= "X"; sec $= "X"; } cur += 2; break; } // italian & armenian if (smid(in, cur, 3, tmp) == "SIO" || tmp == "SIA" || mid(in, cur, 4) == "SIAN") { if (!Slavo_Germanic(in)) { prim $= "S"; sec $= "X"; } else { prim $= "S"; sec $= "S"; } cur += 3; break; } // german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider' // also, -sz- in slavic language altho in hungarian it is pronounced 's' if (( (cur == 0) && (InStr("MNLW", mid(in, cur + 1, 1)) > -1) ) || mid(in, cur + 1, 1) == "Z") { prim $= "S"; sec $= "X"; if (mid(in, cur + 1, 1) == "Z") cur += 2; else cur++; break; } if (mid(in, cur, 2) == "SC") { // Schlesinger's rule if (mid(in, cur + 2, 1) == "H") { // dutch origin, e.g. 'school', 'schooner' if (smid(in, cur + 3, 2, tmp) == "OO" || tmp == "ER" || tmp == "EN" || tmp == "UY" || tmp == "ED" || tmp == "EM") { // 'schermerhorn', 'schenker' if (smid(in, cur + 3, 2, tmp) == "ER" || tmp == "EN") { prim $= "X"; sec $= "SK"; } else { prim $= "SK"; sec $= "SK"; } cur += 3; break; } else { if ((cur == 0) && !is_vowel(in, 3) && (mid(in, cur + 3, 1) != "W")) { prim $= "X"; sec $= "S"; } else { prim $= "X"; sec $= "X"; } cur += 3; break; } if (InStr("IEY", mid(in, cur + 2, 1)) > -1) { prim $= "S"; sec $= "S"; cur += 3; break; } } // else prim $= "SK"; sec $= "SK"; cur += 3; break; } // french e.g. 'resnais', 'artois' if ((cur == last) && ( smid(in, cur-2, 2, tmp) == "AI" || tmp == "OI" )) { prim $= ""; sec $= "S"; } else { prim $= "S"; sec $= "S"; } if (InStr("SZ", mid(in, cur + 1, 1)) > -1) cur += 2; else cur++; break; case "T": if (mid(in, cur, 4) == "TION") { prim $= "X"; sec $= "X"; cur += 3; break; } if (smid(in, cur, 3, tmp) == "TIA" || tmp == "TCH") { prim $= "X"; sec $= "X"; cur += 3; break; } if ((mid(in, cur, 2) == "TH") || (mid(in, cur, 3) == "TTH")) { // special case 'thomas', 'thames' or germanic if (smid(in, cur + 2, 2, tmp) == "OM" || tmp == "AM" || smid(in, 0, 4, tmp) == "VAN" || tmp == "VON" || mid(in, 0, 3) == "SCH") { prim $= "T"; sec $= "T"; } else { prim $= "O"; sec $= "T"; } cur += 2; break; } if (InStr("TD", mid(in, cur + 1, 1)) > -1) cur += 2; else cur++; prim $= "T"; sec $= "T"; break; case "V": if (mid(in, cur + 1, 1) == "V") cur += 2; else cur++; prim $= "F"; sec $= "F"; break; case "W": // can also be in middle of word if (mid(in, cur, 2) == "WR") { prim $= "R"; sec $= "R"; cur += 2; break; } if ((cur == 0) && (is_vowel(in, cur + 1) || mid(in, cur, 2) == "WH")) { // Wasserman should match Vasserman if (is_vowel(in, cur + 1)) { prim $= "A"; sec $= "F"; } else { // need Uomo to match Womo prim $= "A"; sec $= "A"; } } // Arnow should match Arnoff if (((cur == last) && is_vowel(in, cur - 1)) || smid(in, cur - 1, 5, tmp) =="EWSKI" || tmp == "EWSKY" || tmp == "OWSKI" || tmp == "OWSKY" || mid(in, 0, 3) == "SCH") { prim $= ""; sec $= "F"; cur++; break; } // polish e.g. 'filipowicz' if (smid(in, cur, 4, tmp) == "WICZ" || tmp == "WITZ") { prim $= "TS"; sec $= "FX"; cur += 4; break; } // else skip it cur++; break; case "X": // french e.g. breaux if (!((cur == last) && (smid(in, cur - 3, 3, tmp) == "IAU" || tmp == "EAU" || smid(in, cur - 2, 2, tmp) == "AU" || tmp == "OU"))) { prim $= "KS"; sec $= "KS"; } if (InStr("CX", mid(in, cur + 1, 1)) > -1) cur += 2; else cur++; break; case "Z": // chinese pinyin e.g. 'zhao' if (mid(in, cur + 1, 1) == "H") { prim $= "J"; sec $= "J"; cur += 2; break; } else if (smid(in, cur + 1, 2, tmp) == "ZO" || tmp == "ZI" || tmp == "ZA" || (Slavo_Germanic(in) && ((cur > 0) && mid(in, cur - 1, 1) != "T"))) { prim $= "S"; sec $= "TS"; } else { prim $= "S"; sec $= "S"; } if (mid(in, cur + 1, 1) == "Z") cur += 2; else cur++; break; default: cur++; } } res[0] = Left(prim, mlen); res[1] = Left(sec, mlen); return res; }
Helper functions
/** Same as mid, except that is has a 4th param that will also get the result. <br /> Use this for caching the result. */ static final function string smid(coerce string in, int off, int len, out string s) { s = mid(in, off, len); return s; } /** returns true if the character at pos is a vowel. */ static final function bool is_vowel(coerce string in, int pos) { return (InStr("AEIOUY", mid(in, pos, 1)) > -1); } /** helper function for the metaphone algorithm to determain if this word is "slave germanic" */ static final function bool Slavo_Germanic(coerce string in) { return (InStr(in, "W") > -1) || (InStr(in, "K") > -1) || (InStr(in, "CZ") > -1) || (InStr(in, "WITZ") > -1); }
Copyright
Copyright 2004, Michiel Hendriks <elmuerte@drunksnipers.com> All rights reserved. This UnrealScript translation is based heavily on the PHP implementation by Stephen Woodbridge <woodbri@swoodbridge.com>, which in turn is based on the C implementation by Maurice Aubrey <maurice@hevanet.com>, which in turn is based heavily on the C++ implementation by Lawrence Philips and incorporates several bug fixes courtesy of Kevin Atkinson <kevina@users.sourceforge.net>. This module is free software; you may redistribute it and/or modify it under the same terms as Perl itself.
Note that that page currently appears to be blank, the Perl Artistic License is almost similar to the [Artistic License]
Wormbo: Erm... Perl?
El Muerte: Yeah go figure, just pasting a long the license notice. The implementation of Maurice Aubrey has been released under that license, and so is the PHP implementation, thus this implementation should also be released under the same license. The license isn't very restrictive, it allows you to do even more than the LesserOpenUnrealModLicense would do.
Related Topics
- SoundEx
- [More about metaphone] – and implementations in other languages