337 lines
12 KiB
JavaScript
337 lines
12 KiB
JavaScript
/**
|
|
* Levenshtein-Distanz Algorithmus für Fuzzy-Matching
|
|
* Erkennt Abwandlungen und Tippfehler von Blacklist-Begriffen
|
|
*/
|
|
|
|
/**
|
|
* Berechnet die Levenshtein-Distanz zwischen zwei Strings
|
|
* @param {string} str1 - Erster String
|
|
* @param {string} str2 - Zweiter String
|
|
* @returns {number} - Distanz (0 = identisch, höher = unterschiedlicher)
|
|
*/
|
|
function levenshteinDistance(str1, str2) {
|
|
const len1 = str1.length;
|
|
const len2 = str2.length;
|
|
|
|
// Erstelle Matrix
|
|
const matrix = Array(len2 + 1).fill(null).map(() => Array(len1 + 1).fill(null));
|
|
|
|
// Initialisiere erste Zeile und Spalte
|
|
for (let i = 0; i <= len1; i++) {
|
|
matrix[0][i] = i;
|
|
}
|
|
for (let j = 0; j <= len2; j++) {
|
|
matrix[j][0] = j;
|
|
}
|
|
|
|
// Fülle Matrix
|
|
for (let j = 1; j <= len2; j++) {
|
|
for (let i = 1; i <= len1; i++) {
|
|
const cost = str1[i - 1] === str2[j - 1] ? 0 : 1;
|
|
matrix[j][i] = Math.min(
|
|
matrix[j][i - 1] + 1, // Deletion
|
|
matrix[j - 1][i] + 1, // Insertion
|
|
matrix[j - 1][i - 1] + cost // Substitution
|
|
);
|
|
}
|
|
}
|
|
|
|
return matrix[len2][len1];
|
|
}
|
|
|
|
/**
|
|
* Berechnet die normalisierte Levenshtein-Distanz (0-1)
|
|
* @param {string} str1 - Erster String
|
|
* @param {string} str2 - Zweiter String
|
|
* @returns {number} - Normalisierte Distanz (0 = identisch, 1 = komplett unterschiedlich)
|
|
*/
|
|
function normalizedLevenshteinDistance(str1, str2) {
|
|
const distance = levenshteinDistance(str1, str2);
|
|
const maxLength = Math.max(str1.length, str2.length);
|
|
return maxLength === 0 ? 0 : distance / maxLength;
|
|
}
|
|
|
|
/**
|
|
* Prüft ob ein String ähnlich zu einem Blacklist-Begriff ist
|
|
* @param {string} input - Eingabe-String
|
|
* @param {string} blacklistTerm - Blacklist-Begriff
|
|
* @param {number} threshold - Schwellenwert (0-1, niedriger = strenger)
|
|
* @returns {boolean} - True wenn ähnlich genug
|
|
*/
|
|
function isSimilarToBlacklistTerm(input, blacklistTerm, threshold = 0.3) {
|
|
const normalizedDistance = normalizedLevenshteinDistance(input, blacklistTerm);
|
|
return normalizedDistance <= threshold;
|
|
}
|
|
|
|
/**
|
|
* Findet ähnliche Begriffe in einer Blacklist
|
|
* @param {string} input - Eingabe-String
|
|
* @param {Array} blacklistTerms - Array von Blacklist-Begriffen
|
|
* @param {number} threshold - Schwellenwert (0-1)
|
|
* @returns {Array} - Array von ähnlichen Begriffen mit Distanz
|
|
*/
|
|
function findSimilarTerms(input, blacklistTerms, threshold = 0.3) {
|
|
const similarTerms = [];
|
|
const normalizedInput = input.toLowerCase().trim();
|
|
|
|
// Performance-Optimierung: Frühe Beendigung bei sehr kurzen Strings
|
|
if (normalizedInput.length < 2) {
|
|
return similarTerms;
|
|
}
|
|
|
|
for (const term of blacklistTerms) {
|
|
const normalizedTerm = term.toLowerCase().trim();
|
|
|
|
// Performance-Optimierung: Skip bei zu großer Längendifferenz
|
|
const lengthDiff = Math.abs(normalizedInput.length - normalizedTerm.length);
|
|
const maxLengthDiff = Math.ceil(normalizedInput.length * threshold);
|
|
if (lengthDiff > maxLengthDiff) {
|
|
continue;
|
|
}
|
|
|
|
const distance = normalizedLevenshteinDistance(normalizedInput, normalizedTerm);
|
|
if (distance <= threshold) {
|
|
similarTerms.push({
|
|
term: term,
|
|
distance: distance,
|
|
levenshteinDistance: levenshteinDistance(normalizedInput, normalizedTerm)
|
|
});
|
|
}
|
|
}
|
|
|
|
// Sortiere nach Distanz (niedrigste zuerst)
|
|
return similarTerms.sort((a, b) => a.distance - b.distance);
|
|
}
|
|
|
|
/**
|
|
* Erweiterte Blacklist-Prüfung mit Levenshtein-Distanz und Teilstring-Matching
|
|
* @param {string} firstname - Vorname
|
|
* @param {string} lastname - Nachname
|
|
* @param {Array} blacklistTerms - Array von Blacklist-Begriffen
|
|
* @param {number} threshold - Schwellenwert für Ähnlichkeit (0-1)
|
|
* @returns {Object} - Prüfungsergebnis mit ähnlichen Begriffen
|
|
*/
|
|
function checkWithLevenshtein(firstname, lastname, blacklistTerms, threshold = 0.3) {
|
|
const fullName = `${firstname.toLowerCase().trim()} ${lastname.toLowerCase().trim()}`;
|
|
const firstNameOnly = firstname.toLowerCase().trim();
|
|
const lastNameOnly = lastname.toLowerCase().trim();
|
|
|
|
// Prüfe alle Varianten
|
|
const variants = [fullName, firstNameOnly, lastNameOnly];
|
|
const allSimilarTerms = [];
|
|
|
|
for (const variant of variants) {
|
|
// 1. Direkte Levenshtein-Prüfung
|
|
const similarTerms = findSimilarTerms(variant, blacklistTerms, threshold);
|
|
allSimilarTerms.push(...similarTerms);
|
|
|
|
// 2. Teilstring-Matching: Prüfe alle Wörter im Variant gegen Blacklist
|
|
const words = variant.split(/\s+/);
|
|
for (const word of words) {
|
|
if (word.length >= 2) { // Nur Wörter mit mindestens 2 Zeichen
|
|
const wordSimilarTerms = findSimilarTerms(word, blacklistTerms, threshold);
|
|
allSimilarTerms.push(...wordSimilarTerms);
|
|
}
|
|
}
|
|
|
|
// 3. Teilstring-Matching: Prüfe Blacklist-Begriffe gegen Variant
|
|
for (const blacklistTerm of blacklistTerms) {
|
|
const normalizedTerm = blacklistTerm.toLowerCase().trim();
|
|
if (normalizedTerm.length >= 2) {
|
|
// Prüfe ob Blacklist-Begriff als Teilstring im Variant vorkommt
|
|
if (variant.includes(normalizedTerm)) {
|
|
allSimilarTerms.push({
|
|
term: blacklistTerm,
|
|
distance: 0, // Exakte Teilstring-Übereinstimmung
|
|
levenshteinDistance: 0,
|
|
matchType: 'substring'
|
|
});
|
|
} else {
|
|
// Prüfe Levenshtein für Teilstrings
|
|
const words = variant.split(/\s+/);
|
|
for (const word of words) {
|
|
if (word.length >= 2) {
|
|
const distance = normalizedLevenshteinDistance(word, normalizedTerm);
|
|
if (distance <= threshold) {
|
|
allSimilarTerms.push({
|
|
term: blacklistTerm,
|
|
distance: distance,
|
|
levenshteinDistance: levenshteinDistance(word, normalizedTerm),
|
|
matchType: 'substring-similar'
|
|
});
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Entferne Duplikate und sortiere nach Distanz
|
|
const uniqueSimilarTerms = allSimilarTerms.reduce((acc, current) => {
|
|
const existing = acc.find(item => item.term === current.term);
|
|
if (!existing || current.distance < existing.distance) {
|
|
return acc.filter(item => item.term !== current.term).concat(current);
|
|
}
|
|
return acc;
|
|
}, []);
|
|
|
|
return {
|
|
hasSimilarTerms: uniqueSimilarTerms.length > 0,
|
|
similarTerms: uniqueSimilarTerms.sort((a, b) => a.distance - b.distance),
|
|
bestMatch: uniqueSimilarTerms.length > 0 ? uniqueSimilarTerms[0] : null
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Konfigurierbare Schwellenwerte für verschiedene Kategorien
|
|
*/
|
|
const THRESHOLDS = {
|
|
historical: 0.2, // Sehr streng für historische Begriffe
|
|
offensive: 0.25, // Streng für beleidigende Begriffe
|
|
titles: 0.3, // Normal für Titel
|
|
brands: 0.35, // Etwas lockerer für Marken
|
|
inappropriate: 0.3 // Normal für unpassende Begriffe
|
|
};
|
|
|
|
/**
|
|
* Performance-optimierte Version für große Blacklists
|
|
* Verwendet Trigram-Index für bessere Performance
|
|
*/
|
|
class TrigramIndex {
|
|
constructor() {
|
|
this.index = new Map();
|
|
}
|
|
|
|
/**
|
|
* Erstellt Trigramme aus einem String
|
|
* @param {string} str - Eingabe-String
|
|
* @returns {Array} - Array von Trigrammen
|
|
*/
|
|
createTrigrams(str) {
|
|
const normalized = str.toLowerCase().trim();
|
|
const trigrams = [];
|
|
|
|
for (let i = 0; i < normalized.length - 2; i++) {
|
|
trigrams.push(normalized.substring(i, i + 3));
|
|
}
|
|
|
|
return trigrams;
|
|
}
|
|
|
|
/**
|
|
* Fügt einen Begriff zum Index hinzu
|
|
* @param {string} term - Begriff
|
|
* @param {string} category - Kategorie
|
|
*/
|
|
addTerm(term, category) {
|
|
const trigrams = this.createTrigrams(term);
|
|
for (const trigram of trigrams) {
|
|
if (!this.index.has(trigram)) {
|
|
this.index.set(trigram, []);
|
|
}
|
|
this.index.get(trigram).push({ term, category });
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Findet Kandidaten basierend auf Trigram-Übereinstimmung
|
|
* @param {string} input - Eingabe-String
|
|
* @param {number} minTrigrams - Mindestanzahl übereinstimmender Trigramme
|
|
* @returns {Array} - Array von Kandidaten
|
|
*/
|
|
findCandidates(input, minTrigrams = 1) {
|
|
const inputTrigrams = this.createTrigrams(input);
|
|
const candidateCount = new Map();
|
|
|
|
for (const trigram of inputTrigrams) {
|
|
if (this.index.has(trigram)) {
|
|
for (const candidate of this.index.get(trigram)) {
|
|
const key = `${candidate.term}|${candidate.category}`;
|
|
candidateCount.set(key, (candidateCount.get(key) || 0) + 1);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Filtere Kandidaten mit mindestens minTrigrams Übereinstimmungen
|
|
const candidates = [];
|
|
for (const [key, count] of candidateCount) {
|
|
if (count >= minTrigrams) {
|
|
const [term, category] = key.split('|');
|
|
candidates.push({ term, category });
|
|
}
|
|
}
|
|
|
|
return candidates;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Performance-optimierte Blacklist-Prüfung mit Trigram-Index
|
|
* @param {string} firstname - Vorname
|
|
* @param {string} lastname - Nachname
|
|
* @param {Object} blacklist - Blacklist gruppiert nach Kategorien
|
|
* @param {TrigramIndex} trigramIndex - Trigram-Index
|
|
* @returns {Object} - Prüfungsergebnis
|
|
*/
|
|
function checkWithTrigramIndex(firstname, lastname, blacklist, trigramIndex) {
|
|
const fullName = `${firstname.toLowerCase().trim()} ${lastname.toLowerCase().trim()}`;
|
|
const firstNameOnly = firstname.toLowerCase().trim();
|
|
const lastNameOnly = lastname.toLowerCase().trim();
|
|
|
|
const variants = [fullName, firstNameOnly, lastNameOnly];
|
|
const allSimilarTerms = [];
|
|
|
|
for (const variant of variants) {
|
|
// Finde Kandidaten mit Trigram-Index
|
|
const candidates = trigramIndex.findCandidates(variant, 1);
|
|
|
|
// Prüfe nur Kandidaten mit Levenshtein
|
|
for (const candidate of candidates) {
|
|
const categoryTerms = blacklist[candidate.category] || [];
|
|
const similarTerms = findSimilarTerms(variant, categoryTerms, THRESHOLDS[candidate.category] || 0.3);
|
|
allSimilarTerms.push(...similarTerms);
|
|
}
|
|
}
|
|
|
|
// Entferne Duplikate und sortiere
|
|
const uniqueSimilarTerms = allSimilarTerms.reduce((acc, current) => {
|
|
const existing = acc.find(item => item.term === current.term);
|
|
if (!existing || current.distance < existing.distance) {
|
|
return acc.filter(item => item.term !== current.term).concat(current);
|
|
}
|
|
return acc;
|
|
}, []);
|
|
|
|
return {
|
|
hasSimilarTerms: uniqueSimilarTerms.length > 0,
|
|
similarTerms: uniqueSimilarTerms.sort((a, b) => a.distance - b.distance),
|
|
bestMatch: uniqueSimilarTerms.length > 0 ? uniqueSimilarTerms[0] : null
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Kategorie-spezifische Levenshtein-Prüfung
|
|
* @param {string} firstname - Vorname
|
|
* @param {string} lastname - Nachname
|
|
* @param {Array} blacklistTerms - Array von Blacklist-Begriffen
|
|
* @param {string} category - Kategorie der Begriffe
|
|
* @returns {Object} - Prüfungsergebnis
|
|
*/
|
|
function checkWithCategoryThreshold(firstname, lastname, blacklistTerms, category) {
|
|
const threshold = THRESHOLDS[category] || 0.3;
|
|
return checkWithLevenshtein(firstname, lastname, blacklistTerms, threshold);
|
|
}
|
|
|
|
module.exports = {
|
|
levenshteinDistance,
|
|
normalizedLevenshteinDistance,
|
|
isSimilarToBlacklistTerm,
|
|
findSimilarTerms,
|
|
checkWithLevenshtein,
|
|
checkWithCategoryThreshold,
|
|
checkWithTrigramIndex,
|
|
TrigramIndex,
|
|
THRESHOLDS
|
|
};
|