Update
This commit is contained in:
335
config/blacklist-db.js
Normal file
335
config/blacklist-db.js
Normal file
@@ -0,0 +1,335 @@
|
||||
/**
|
||||
* Datenbankbasierte Blacklist für unerwünschte Namen
|
||||
* Lädt und verwaltet Blacklist-Einträge aus der Datenbank
|
||||
*/
|
||||
|
||||
const { checkWithCategoryThreshold, checkWithTrigramIndex, TrigramIndex, THRESHOLDS } = require('./levenshtein');
|
||||
|
||||
// Datenbankverbindung direkt hier implementieren
|
||||
const { Pool } = require('pg');
|
||||
|
||||
// Shared PostgreSQL connection pool for this module. Each setting is read
// from the environment and falls back to a local-development default.
// NOTE(review): the fallback password is a hard-coded literal; confirm that
// deployments always provide DB_PASSWORD.
const { DB_USER, DB_HOST, DB_NAME, DB_PASSWORD, DB_PORT } = process.env;

const pool = new Pool({
  user: DB_USER || 'postgres',
  host: DB_HOST || 'localhost',
  database: DB_NAME || 'ninjaserver',
  password: DB_PASSWORD || 'password',
  port: DB_PORT || 5432,
});
|
||||
|
||||
// Module-level cache state shared by the functions in this file.

// Maximum age of a cached blacklist in milliseconds (five minutes).
const CACHE_TTL = 5 * 60 * 1000;

// Trigram index built alongside the cached blacklist; null until the first
// successful database load and after every cache invalidation.
let trigramIndex = null;

// Last blacklist loaded from the database, grouped by category.
let blacklistCache = null;

// Date.now() timestamp of the last successful load; 0 means "never loaded".
let lastCacheUpdate = 0;
|
||||
|
||||
/**
 * Loads all blacklist terms from the database, grouped by category.
 *
 * A successful load is cached for CACHE_TTL milliseconds and rebuilds the
 * module-level trigram index. Rows whose category is not one of the known
 * categories, or whose term is not a string, are ignored. If the query (or
 * index construction) fails, the error is logged and the static fallback
 * list is returned; the fallback is not cached.
 *
 * @returns {Promise<Object>} Blacklist as `{ category: string[] }`.
 */
async function loadBlacklistFromDB() {
  const now = Date.now();

  // Serve from the cache while it is still fresh.
  if (blacklistCache && now - lastCacheUpdate < CACHE_TTL) {
    return blacklistCache;
  }

  try {
    const result = await pool.query(
      'SELECT term, category FROM blacklist_terms ORDER BY category, term'
    );

    const blacklist = {
      historical: [],
      offensive: [],
      titles: [],
      brands: [],
      inappropriate: [],
      racial: [],
      religious: [],
      disability: [],
      leetspeak: [],
      cyberbullying: [],
      drugs: [],
      violence: []
    };

    // Build the index locally and publish it together with the cache, so a
    // failure halfway through never leaves a partially filled index behind.
    const freshIndex = new TrigramIndex();

    for (const row of result.rows) {
      // Own-property check: a category such as "constructor" would otherwise
      // resolve to an inherited member and make `.push` throw.
      const isKnownCategory = Object.prototype.hasOwnProperty.call(blacklist, row.category);
      if (!isKnownCategory || typeof row.term !== 'string') {
        continue;
      }
      blacklist[row.category].push(row.term);
      freshIndex.addTerm(row.term, row.category);
    }

    trigramIndex = freshIndex;
    blacklistCache = blacklist;
    lastCacheUpdate = now;

    return blacklist;
  } catch (error) {
    console.error('Error loading blacklist from database:', error);
    // Fall back to the static list so name checks keep working.
    return getStaticBlacklist();
  }
}
|
||||
|
||||
/**
 * Builds the hard-coded fallback blacklist.
 *
 * A fresh object with fresh arrays is created on every call. Only the first
 * five categories carry terms; the remaining categories are present but
 * empty.
 *
 * @returns {Object} Blacklist as `{ category: string[] }`.
 */
function getStaticBlacklist() {
  const historical = [
    'adolf', 'hitler', 'adolf hitler', 'adolfhittler',
    'mussolini', 'benito', 'benito mussolini',
    'stalin', 'joseph stalin', 'mao', 'mao zedong'
  ];

  const offensive = [
    'satan', 'luzifer', 'teufel', 'devil',
    'hurensohn', 'wichser', 'fotze', 'arschloch',
    'idiot', 'dummkopf', 'trottel', 'schwachsinnig',
    'nazi', 'faschist', 'rassist'
  ];

  const titles = [
    'lord', 'lady', 'sir', 'dame',
    'prinz', 'prinzessin', 'prince', 'princess',
    'könig', 'königin', 'king', 'queen',
    'doktor', 'professor', 'dr', 'prof'
  ];

  const brands = [
    'mcdonald', 'coca cola', 'cocacola', 'pepsi',
    'nike', 'adidas', 'puma', 'reebok',
    'bmw', 'mercedes', 'audi', 'volkswagen'
  ];

  const inappropriate = [
    'sex', 'porn', 'porno', 'fuck', 'shit',
    'bitch', 'whore', 'prostitute',
    'drug', 'cocaine', 'heroin', 'marijuana'
  ];

  return {
    historical,
    offensive,
    titles,
    brands,
    inappropriate,
    racial: [],
    religious: [],
    disability: [],
    leetspeak: [],
    cyberbullying: [],
    drugs: [],
    violence: []
  };
}
|
||||
|
||||
/**
 * Checks a first/last name pair against the database-backed blacklist.
 *
 * Stage 1 blocks the name when the lower-cased "first last" string contains
 * a blacklist term (reported as matchType 'exact'). Stage 2 runs the fuzzy
 * check: the trigram-index variant when an index exists and the list holds
 * more than 100 terms, otherwise the per-category Levenshtein check, which
 * stops at the first category that reports a match.
 *
 * Only "name contains term" counts as a hit. The opposite direction ("term
 * contains name") is deliberately not checked, because it blocks harmless
 * short names such as "Ben" (inside "benito") or "Ana" (inside "marijuana").
 * Empty or whitespace-only terms are skipped, since they would match every
 * name.
 *
 * @param {string} firstname - First name.
 * @param {string} lastname - Last name.
 * @returns {Promise<Object>} `{ isBlocked, reason, category }`; blocked
 *   results also carry `matchedTerm` and `matchType`, and fuzzy hits add
 *   `similarity` and `levenshteinDistance`.
 */
async function checkNameAgainstBlacklist(firstname, lastname) {
  if (!firstname || !lastname || typeof firstname !== 'string' || typeof lastname !== 'string') {
    return { isBlocked: false, reason: '', category: '' };
  }

  try {
    const blacklist = await loadBlacklistFromDB();
    const fullName = `${firstname.toLowerCase()} ${lastname.toLowerCase()}`;

    // 1. Containment check. The full name contains both single names, so one
    //    test covers "first", "last" and "first last".
    for (const [category, entries] of Object.entries(blacklist)) {
      for (const entry of entries) {
        const term = entry.toLowerCase();
        if (term.trim() === '') {
          continue;
        }
        if (fullName.includes(term)) {
          return {
            isBlocked: true,
            reason: getCategoryReason(category),
            category,
            matchedTerm: entry,
            matchType: 'exact'
          };
        }
      }
    }

    // 2. Fuzzy matching (Levenshtein distance).
    let fuzzyResult = null;
    let fuzzyCategory = null;
    const termCount = Object.values(blacklist).reduce((sum, entries) => sum + entries.length, 0);

    if (trigramIndex && termCount > 100) {
      // Large list: pre-filter through the trigram index.
      fuzzyResult = checkWithTrigramIndex(firstname, lastname, blacklist, trigramIndex);
    } else {
      // Small list: check category by category, stop at the first hit.
      for (const [category, entries] of Object.entries(blacklist)) {
        const categoryResult = checkWithCategoryThreshold(firstname, lastname, entries, category);
        if (categoryResult.hasSimilarTerms) {
          fuzzyResult = categoryResult;
          // Remember the category: the match object itself may not carry it.
          fuzzyCategory = category;
          break;
        }
      }
    }

    if (fuzzyResult && fuzzyResult.hasSimilarTerms) {
      const bestMatch = fuzzyResult.bestMatch;
      const category = bestMatch.category || fuzzyCategory || 'unknown';
      return {
        isBlocked: true,
        reason: `${getCategoryReason(category)} (ähnlich)`,
        category,
        matchedTerm: bestMatch.term,
        matchType: 'similar',
        similarity: bestMatch.distance,
        levenshteinDistance: bestMatch.levenshteinDistance
      };
    }

    return { isBlocked: false, reason: '', category: '' };
  } catch (error) {
    // Best effort: an unexpected failure is logged and the name is let
    // through rather than rejecting the request.
    console.error('Error checking name against blacklist:', error);
    return { isBlocked: false, reason: '', category: '' };
  }
}
|
||||
|
||||
/**
 * Returns the user-facing (German) explanation for a blacklist category.
 *
 * @param {string} category - Category key.
 * @returns {string} The explanation, or a generic text for any key that is
 *   not a known category.
 */
function getCategoryReason(category) {
  const reasons = {
    historical: 'Historisch belasteter Name',
    offensive: 'Beleidigender oder anstößiger Begriff',
    titles: 'Titel oder Berufsbezeichnung',
    brands: 'Markenname',
    inappropriate: 'Unpassender Begriff',
    racial: 'Rassistischer oder ethnisch beleidigender Begriff',
    religious: 'Religiös beleidigender oder blasphemischer Begriff',
    disability: 'Beleidigender Begriff bezüglich Behinderungen',
    leetspeak: 'Verschleierter beleidigender Begriff',
    cyberbullying: 'Cyberbullying oder Online-Belästigung',
    drugs: 'Drogenbezogener Begriff',
    violence: 'Gewalt- oder bedrohungsbezogener Begriff'
  };

  // Own-property lookup: keys such as "constructor" or "toString" must fall
  // through to the generic text instead of returning a prototype member.
  return Object.prototype.hasOwnProperty.call(reasons, category)
    ? reasons[category]
    : 'Unzulässiger Begriff';
}
|
||||
|
||||
/**
 * Inserts a term into the `blacklist_terms` table and invalidates the cache.
 *
 * The term is stored lower-cased. An existing (term, category) pair is left
 * untouched by the `ON CONFLICT ... DO NOTHING` clause. Empty or
 * whitespace-only terms are rejected, because the containment check would
 * match such a term against every name.
 *
 * @param {string} term - Term to add.
 * @param {string} category - Category of the term.
 * @param {string} [createdBy='admin'] - Who added the term.
 * @returns {Promise<boolean>} Resolves to `true` when the statement ran.
 * @throws {TypeError} If `term` is not a non-empty string.
 * @throws Rethrows any error raised by the database query after logging it.
 */
async function addToBlacklist(term, category, createdBy = 'admin') {
  try {
    if (typeof term !== 'string' || term.trim() === '') {
      throw new TypeError('Blacklist term must be a non-empty string');
    }

    await pool.query(
      'INSERT INTO blacklist_terms (term, category, created_by) VALUES ($1, $2, $3) ON CONFLICT (term, category) DO NOTHING',
      [term.toLowerCase(), category, createdBy]
    );

    // The cached list and trigram index are now stale.
    invalidateCache();

    return true;
  } catch (error) {
    console.error('Error adding to blacklist:', error);
    throw error;
  }
}
|
||||
|
||||
/**
 * Deletes a (term, category) pair from the `blacklist_terms` table.
 *
 * The term is lower-cased before the lookup. The cache is invalidated after
 * every completed statement, whether or not a row was deleted.
 *
 * @param {string} term - Term to remove.
 * @param {string} category - Category of the term.
 * @returns {Promise<boolean>} `true` if at least one row was deleted.
 * @throws Rethrows any error raised while removing, after logging it.
 */
async function removeFromBlacklist(term, category) {
  try {
    const queryParams = [term.toLowerCase(), category];
    const deletion = await pool.query(
      'DELETE FROM blacklist_terms WHERE term = $1 AND category = $2',
      queryParams
    );

    invalidateCache();

    const removedSomething = deletion.rowCount > 0;
    return removedSomething;
  } catch (error) {
    console.error('Error removing from blacklist:', error);
    throw error;
  }
}
|
||||
|
||||
/**
 * Drops the cached blacklist and the trigram index and resets the cache
 * timestamp, so the next load goes back to the database.
 */
function invalidateCache() {
  lastCacheUpdate = 0;
  trigramIndex = null;
  blacklistCache = null;
}
|
||||
|
||||
/**
 * Returns the complete blacklist (intended for admin use).
 *
 * Delegates to loadBlacklistFromDB and passes its result through unchanged.
 *
 * @returns {Promise<Object>} Blacklist as `{ category: string[] }`.
 */
async function getBlacklist() {
  const blacklist = await loadBlacklistFromDB();
  return blacklist;
}
|
||||
|
||||
/**
 * Copies every term of the static fallback list into the database
 * (intended for initial setup or migration only).
 *
 * Terms are inserted one after another with creator 'system'. A failing
 * insert is logged and skipped; the remaining terms are still processed.
 *
 * @returns {Promise<void>}
 */
async function syncStaticBlacklist() {
  // Flatten { category: [terms] } into an ordered list of pairs.
  const pending = Object.entries(getStaticBlacklist()).flatMap(
    ([category, terms]) => terms.map((term) => ({ term, category }))
  );

  for (const { term, category } of pending) {
    try {
      await addToBlacklist(term, category, 'system');
    } catch (error) {
      console.error(`Error syncing term ${term} in category ${category}:`, error);
    }
  }
}
|
||||
|
||||
module.exports = {
|
||||
checkNameAgainstBlacklist,
|
||||
addToBlacklist,
|
||||
removeFromBlacklist,
|
||||
getBlacklist,
|
||||
loadBlacklistFromDB,
|
||||
syncStaticBlacklist
|
||||
};
|
||||
176
config/blacklist.js
Normal file
176
config/blacklist.js
Normal file
@@ -0,0 +1,176 @@
|
||||
/**
|
||||
* Blacklist für unerwünschte Namen
|
||||
* Basierend auf deutschen Namensregeln und allgemeinen Richtlinien
|
||||
*/
|
||||
|
||||
// In-memory blacklist grouped by category. Terms are stored lower-cased;
// addToBlacklist and removeFromBlacklist mutate these arrays in place.
const BLACKLIST = {
  // Historically tainted names.
  // (The duplicated 'adolfhittler' entry was removed; each term is listed once.)
  historical: [
    'adolf', 'hitler', 'adolf hitler', 'adolfhittler',
    'mussolini', 'benito', 'benito mussolini',
    'stalin', 'joseph stalin',
    'mao', 'mao zedong',
    'pol pot', 'polpot',
    'saddam', 'saddam hussein',
    'osama', 'osama bin laden',
    'kim jong', 'kim jong il', 'kim jong un'
  ],

  // Insulting or offensive terms.
  offensive: [
    'satan', 'luzifer', 'teufel', 'devil',
    'hurensohn', 'wichser', 'fotze', 'arschloch',
    'idiot', 'dummkopf', 'trottel', 'schwachsinnig',
    'nazi', 'faschist', 'rassist',
    'terrorist', 'mörder', 'killer'
  ],

  // Titles and job designations.
  titles: [
    'lord', 'lady', 'sir', 'dame',
    'prinz', 'prinzessin', 'prince', 'princess',
    'könig', 'königin', 'king', 'queen',
    'kaiser', 'kaiserin', 'emperor', 'empress',
    'doktor', 'professor', 'dr', 'prof',
    'pastor', 'pfarrer', 'bischof', 'priester',
    'richter', 'anwalt', 'notar'
  ],

  // Brand names (examples).
  brands: [
    'mcdonald', 'coca cola', 'cocacola', 'pepsi',
    'nike', 'adidas', 'puma', 'reebok',
    'bmw', 'mercedes', 'audi', 'volkswagen',
    'apple', 'microsoft', 'google', 'facebook',
    'samsung', 'sony', 'panasonic'
  ],

  // Inappropriate terms.
  inappropriate: [
    'sex', 'porn', 'porno', 'fuck', 'shit',
    'bitch', 'whore', 'prostitute',
    'drug', 'cocaine', 'heroin', 'marijuana',
    'bomb', 'explosive', 'weapon', 'gun'
  ]
};
|
||||
|
||||
/**
 * Checks a first/last name pair against the in-memory BLACKLIST.
 *
 * The name is blocked when the lower-cased "first last" string contains a
 * blacklist term. Categories and terms are scanned in declaration order and
 * the first hit is returned.
 *
 * Only "name contains term" counts as a hit. The opposite direction ("term
 * contains name") is deliberately not checked, because it blocks harmless
 * short names such as "Ben" (inside "benito") or "Ana" (inside "marijuana").
 * Empty or whitespace-only terms are skipped, since they would match every
 * name.
 *
 * @param {string} firstname - First name.
 * @param {string} lastname - Last name.
 * @returns {Object} `{ isBlocked, reason, category }`, plus `matchedTerm`
 *   when the name is blocked.
 */
function checkNameAgainstBlacklist(firstname, lastname) {
  if (!firstname || !lastname) {
    return { isBlocked: false, reason: '', category: '' };
  }

  // The full name contains both single names, so one containment test
  // covers "first", "last" and "first last".
  const fullName = `${firstname.toLowerCase()} ${lastname.toLowerCase()}`;

  for (const [category, entries] of Object.entries(BLACKLIST)) {
    for (const entry of entries) {
      const term = entry.toLowerCase();
      if (term.trim() === '') {
        continue;
      }
      if (fullName.includes(term)) {
        return {
          isBlocked: true,
          reason: getCategoryReason(category),
          category,
          matchedTerm: entry
        };
      }
    }
  }

  return { isBlocked: false, reason: '', category: '' };
}
|
||||
|
||||
/**
 * Returns the user-facing (German) explanation for a blacklist category.
 *
 * @param {string} category - Category key.
 * @returns {string} The explanation, or a generic text for any key that is
 *   not a known category.
 */
function getCategoryReason(category) {
  const reasons = {
    historical: 'Historisch belasteter Name',
    offensive: 'Beleidigender oder anstößiger Begriff',
    titles: 'Titel oder Berufsbezeichnung',
    brands: 'Markenname',
    inappropriate: 'Unpassender Begriff'
  };

  // Own-property lookup: keys such as "constructor" or "toString" must fall
  // through to the generic text instead of returning a prototype member.
  return Object.prototype.hasOwnProperty.call(reasons, category)
    ? reasons[category]
    : 'Unzulässiger Begriff';
}
|
||||
|
||||
/**
 * Adds a term to an existing category of the in-memory BLACKLIST.
 *
 * The term is stored lower-cased. Nothing happens when the category does not
 * exist, when the term is already listed, or when the term is empty or
 * whitespace-only (such a term would match every name in the containment
 * check).
 *
 * @param {string} term - Term to add.
 * @param {string} category - Category to add the term to.
 */
function addToBlacklist(term, category) {
  // Own-property check: an inherited key such as "constructor" is not a
  // category and must be ignored instead of throwing.
  if (!Object.prototype.hasOwnProperty.call(BLACKLIST, category)) {
    return;
  }
  const entries = BLACKLIST[category];
  if (!Array.isArray(entries)) {
    return;
  }

  const normalizedTerm = term.toLowerCase();
  if (normalizedTerm.trim() === '') {
    return;
  }

  if (!entries.includes(normalizedTerm)) {
    entries.push(normalizedTerm);
  }
}
|
||||
|
||||
/**
 * Removes a term from a category of the in-memory BLACKLIST.
 *
 * The term is lower-cased before the lookup and only its first occurrence is
 * removed. Nothing happens when the category or the term does not exist.
 *
 * @param {string} term - Term to remove.
 * @param {string} category - Category to remove the term from.
 */
function removeFromBlacklist(term, category) {
  // Own-property check: an inherited key such as "constructor" is not a
  // category and must be ignored instead of throwing.
  if (!Object.prototype.hasOwnProperty.call(BLACKLIST, category)) {
    return;
  }
  const entries = BLACKLIST[category];
  if (!Array.isArray(entries)) {
    return;
  }

  const index = entries.indexOf(term.toLowerCase());
  if (index > -1) {
    entries.splice(index, 1);
  }
}
|
||||
|
||||
/**
 * Returns the complete in-memory blacklist (intended for admin use).
 *
 * The module's own BLACKLIST object is handed out, not a copy, so changes
 * made by the caller affect later name checks.
 *
 * @returns {Object} Blacklist as `{ category: string[] }`.
 */
function getBlacklist() {
  const completeBlacklist = BLACKLIST;
  return completeBlacklist;
}
|
||||
|
||||
module.exports = {
|
||||
checkNameAgainstBlacklist,
|
||||
addToBlacklist,
|
||||
removeFromBlacklist,
|
||||
getBlacklist,
|
||||
BLACKLIST
|
||||
};
|
||||
336
config/levenshtein.js
Normal file
336
config/levenshtein.js
Normal file
@@ -0,0 +1,336 @@
|
||||
/**
|
||||
* Levenshtein-Distanz Algorithmus für Fuzzy-Matching
|
||||
* Erkennt Abwandlungen und Tippfehler von Blacklist-Begriffen
|
||||
*/
|
||||
|
||||
/**
 * Computes the Levenshtein (edit) distance between two strings.
 *
 * Uses the classic dynamic-programming table, where cell [row][col] holds
 * the distance between the first `row` characters of `str2` and the first
 * `col` characters of `str1`.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @returns {number} Number of single-character insertions, deletions and
 *   substitutions needed to turn one string into the other (0 = identical).
 */
function levenshteinDistance(str1, str2) {
  const colCount = str1.length + 1;
  const rowCount = str2.length + 1;

  // Border cells are seeded directly: the first row counts up along str1,
  // the first column counts up along str2. Inner cells are filled below.
  const table = Array.from({ length: rowCount }, (_row, row) =>
    Array.from({ length: colCount }, (_col, col) => {
      if (row === 0) {
        return col;
      }
      return col === 0 ? row : null;
    })
  );

  for (let row = 1; row < rowCount; row += 1) {
    for (let col = 1; col < colCount; col += 1) {
      const substitutionCost = str1[col - 1] === str2[row - 1] ? 0 : 1;
      const viaDeletion = table[row][col - 1] + 1;
      const viaInsertion = table[row - 1][col] + 1;
      const viaSubstitution = table[row - 1][col - 1] + substitutionCost;
      table[row][col] = Math.min(viaDeletion, viaInsertion, viaSubstitution);
    }
  }

  return table[rowCount - 1][colCount - 1];
}
|
||||
|
||||
/**
 * Computes the Levenshtein distance scaled by the longer string's length.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @returns {number} Distance divided by the longer length; 0 when both
 *   strings are empty.
 */
function normalizedLevenshteinDistance(str1, str2) {
  const editDistance = levenshteinDistance(str1, str2);
  const longest = Math.max(str1.length, str2.length);

  // Two empty strings are identical; this also avoids dividing by zero.
  if (longest === 0) {
    return 0;
  }
  return editDistance / longest;
}
|
||||
|
||||
/**
 * Tells whether an input string is close enough to a blacklist term.
 *
 * The strings are compared exactly as given; no case folding or trimming
 * happens here.
 *
 * @param {string} input - Input string.
 * @param {string} blacklistTerm - Blacklist term to compare with.
 * @param {number} [threshold=0.3] - Largest normalized distance that still
 *   counts as similar (lower = stricter).
 * @returns {boolean} `true` if the normalized distance is at most `threshold`.
 */
function isSimilarToBlacklistTerm(input, blacklistTerm, threshold = 0.3) {
  return normalizedLevenshteinDistance(input, blacklistTerm) <= threshold;
}
|
||||
|
||||
/**
 * Finds the blacklist terms whose normalized Levenshtein distance to the
 * input is within the threshold.
 *
 * Input and terms are lower-cased and trimmed before comparison. Inputs
 * shorter than two characters never match.
 *
 * @param {string} input - Input string.
 * @param {Array<string>} blacklistTerms - Terms to compare against.
 * @param {number} [threshold=0.3] - Largest normalized distance (0-1) that
 *   still counts as similar.
 * @returns {Array<Object>} `{ term, distance, levenshteinDistance }` entries
 *   sorted by ascending `distance`; `term` is the original, unnormalized term.
 */
function findSimilarTerms(input, blacklistTerms, threshold = 0.3) {
  const similarTerms = [];
  const normalizedInput = input.toLowerCase().trim();

  // Very short inputs are skipped entirely.
  if (normalizedInput.length < 2) {
    return similarTerms;
  }

  for (const term of blacklistTerms) {
    const normalizedTerm = term.toLowerCase().trim();
    const longest = Math.max(normalizedInput.length, normalizedTerm.length);

    // Cheap pre-filter: the edit distance is never smaller than the length
    // difference, so a pair whose length difference alone exceeds the
    // threshold cannot match. The bound is measured against the longer
    // string, the same denominator as the final distance, so no pair that
    // would pass the real check is discarded.
    const lengthDiff = Math.abs(normalizedInput.length - normalizedTerm.length);
    if (lengthDiff / longest > threshold) {
      continue;
    }

    // Compute the edit distance once and derive the normalized value from it.
    const rawDistance = levenshteinDistance(normalizedInput, normalizedTerm);
    const distance = rawDistance / longest;
    if (distance <= threshold) {
      similarTerms.push({
        term,
        distance,
        levenshteinDistance: rawDistance
      });
    }
  }

  // Closest match first.
  return similarTerms.sort((a, b) => a.distance - b.distance);
}
|
||||
|
||||
/**
 * Fuzzy blacklist check combining Levenshtein distance with substring
 * matching.
 *
 * Three variants of the name are examined: "first last", the first name and
 * the last name (all lower-cased and trimmed). For each variant, in order:
 *   1. the whole variant is compared with every term (findSimilarTerms);
 *   2. every word of the variant with at least two characters is compared
 *      with every term (findSimilarTerms);
 *   3. every term of at least two characters either is contained in the
 *      variant (distance 0, matchType 'substring') or is compared word by
 *      word (matchType 'substring-similar').
 * For every term only the entry with the smallest distance is kept; on a
 * tie the entry found first wins.
 *
 * @param {string} firstname - First name.
 * @param {string} lastname - Last name.
 * @param {Array<string>} blacklistTerms - Terms to compare against.
 * @param {number} [threshold=0.3] - Largest normalized distance (0-1) that
 *   still counts as similar.
 * @returns {Object} `{ hasSimilarTerms, similarTerms, bestMatch }` where
 *   `similarTerms` is sorted by ascending distance and `bestMatch` is its
 *   first entry or `null`.
 */
function checkWithLevenshtein(firstname, lastname, blacklistTerms, threshold = 0.3) {
  const firstNameOnly = firstname.toLowerCase().trim();
  const lastNameOnly = lastname.toLowerCase().trim();
  const fullName = `${firstNameOnly} ${lastNameOnly}`;

  // Best entry per term. A Map keeps the de-duplication linear; deleting
  // before re-inserting moves an improved entry to the end, which keeps the
  // tie order of the final sort stable.
  const bestByTerm = new Map();
  const record = (match) => {
    const known = bestByTerm.get(match.term);
    if (!known || match.distance < known.distance) {
      bestByTerm.delete(match.term);
      bestByTerm.set(match.term, match);
    }
  };

  for (const variant of [fullName, firstNameOnly, lastNameOnly]) {
    // 1. Whole variant against the list.
    findSimilarTerms(variant, blacklistTerms, threshold).forEach(record);

    // 2. Each sufficiently long word against the list.
    const words = variant.split(/\s+/).filter((word) => word.length >= 2);
    for (const word of words) {
      findSimilarTerms(word, blacklistTerms, threshold).forEach(record);
    }

    // 3. Each term against the variant: containment first, then word by word.
    for (const blacklistTerm of blacklistTerms) {
      const normalizedTerm = blacklistTerm.toLowerCase().trim();
      if (normalizedTerm.length < 2) {
        continue;
      }

      if (variant.includes(normalizedTerm)) {
        record({
          term: blacklistTerm,
          distance: 0, // exact substring hit
          levenshteinDistance: 0,
          matchType: 'substring'
        });
        continue;
      }

      for (const word of words) {
        const distance = normalizedLevenshteinDistance(word, normalizedTerm);
        if (distance <= threshold) {
          record({
            term: blacklistTerm,
            distance,
            levenshteinDistance: levenshteinDistance(word, normalizedTerm),
            matchType: 'substring-similar'
          });
        }
      }
    }
  }

  const similarTerms = [...bestByTerm.values()].sort((a, b) => a.distance - b.distance);

  return {
    hasSimilarTerms: similarTerms.length > 0,
    similarTerms,
    bestMatch: similarTerms.length > 0 ? similarTerms[0] : null
  };
}
|
||||
|
||||
/**
 * Similarity thresholds per blacklist category (normalized distance, 0-1;
 * lower = stricter). Code in this file falls back to 0.3 for categories
 * that are not listed here.
 */
const THRESHOLDS = {
  // Very strict for historical terms.
  historical: 0.2,
  // Strict for offensive terms.
  offensive: 0.25,
  // Default strictness for titles.
  titles: 0.3,
  // Slightly more lenient for brand names.
  brands: 0.35,
  // Default strictness for inappropriate terms.
  inappropriate: 0.3
};
|
||||
|
||||
/**
 * Inverted index from trigrams (three-character substrings) to blacklist
 * terms, used to pre-select candidates before the Levenshtein comparison.
 *
 * Strings shorter than three characters yield no trigrams, so such terms
 * are never indexed and such inputs never produce candidates.
 */
class TrigramIndex {
  constructor() {
    // trigram -> array of { term, category } entries containing it
    this.index = new Map();
  }

  /**
   * Splits a string into its overlapping trigrams.
   * @param {string} str - Input string (lower-cased and trimmed first).
   * @returns {Array<string>} Trigrams in order of appearance; may contain
   *   repeats.
   */
  createTrigrams(str) {
    const normalized = str.toLowerCase().trim();
    const trigrams = [];

    for (let start = 0; start + 3 <= normalized.length; start += 1) {
      trigrams.push(normalized.slice(start, start + 3));
    }

    return trigrams;
  }

  /**
   * Adds a term to the index under each of its trigrams.
   * @param {string} term - Term to index.
   * @param {string} category - Category the term belongs to.
   */
  addTerm(term, category) {
    for (const trigram of this.createTrigrams(term)) {
      let bucket = this.index.get(trigram);
      if (!bucket) {
        bucket = [];
        this.index.set(trigram, bucket);
      }
      bucket.push({ term, category });
    }
  }

  /**
   * Finds indexed terms that share trigrams with the input.
   * @param {string} input - Input string.
   * @param {number} [minTrigrams=1] - Minimum number of trigram hits a term
   *   needs to be returned.
   * @returns {Array<Object>} `{ term, category }` candidates, one per
   *   distinct term/category pair, in order of first hit.
   */
  findCandidates(input, minTrigrams = 1) {
    // Hits are grouped per (term, category) pair. The pair is encoded as
    // JSON for the key and the original values are kept alongside the
    // counter, so terms containing "|" or an undefined category come back
    // unchanged instead of being rebuilt from a split string.
    const hits = new Map();

    for (const trigram of this.createTrigrams(input)) {
      const bucket = this.index.get(trigram);
      if (!bucket) {
        continue;
      }
      for (const { term, category } of bucket) {
        const key = JSON.stringify([term, category]);
        const hit = hits.get(key);
        if (hit) {
          hit.count += 1;
        } else {
          hits.set(key, { term, category, count: 1 });
        }
      }
    }

    const candidates = [];
    for (const { term, category, count } of hits.values()) {
      if (count >= minTrigrams) {
        candidates.push({ term, category });
      }
    }

    return candidates;
  }
}
|
||||
|
||||
/**
 * Fuzzy blacklist check that uses the trigram index as a pre-filter.
 *
 * For each variant of the name ("first last", first name, last name; all
 * lower-cased and trimmed) the index is asked for candidates. Every category
 * that contributed at least one candidate is then compared against the
 * variant with findSimilarTerms, using that category's threshold (0.3 when
 * none is configured). Each category is scanned once per variant, however
 * many candidates it contributed. For every term only the entry with the
 * smallest distance is kept.
 *
 * @param {string} firstname - First name.
 * @param {string} lastname - Last name.
 * @param {Object} blacklist - Blacklist as `{ category: string[] }`.
 * @param {TrigramIndex} trigramIndex - Index built from the same blacklist.
 * @returns {Object} `{ hasSimilarTerms, similarTerms, bestMatch }`; every
 *   match carries `term`, `distance`, `levenshteinDistance` and the
 *   `category` it was found in.
 */
function checkWithTrigramIndex(firstname, lastname, blacklist, trigramIndex) {
  const firstNameOnly = firstname.toLowerCase().trim();
  const lastNameOnly = lastname.toLowerCase().trim();
  const fullName = `${firstNameOnly} ${lastNameOnly}`;

  // Best entry per term; delete-then-set moves an improved entry to the end
  // so ties keep their discovery order in the final sort.
  const bestByTerm = new Map();

  for (const variant of [fullName, firstNameOnly, lastNameOnly]) {
    // Distinct categories in order of first appearance among the candidates.
    const categories = new Set(
      trigramIndex.findCandidates(variant, 1).map((candidate) => candidate.category)
    );

    for (const category of categories) {
      // Own-property and type checks keep inherited keys such as
      // "constructor" from being treated as a term list or a threshold.
      const categoryTerms =
        Object.prototype.hasOwnProperty.call(blacklist, category) && Array.isArray(blacklist[category])
          ? blacklist[category]
          : [];
      const threshold = typeof THRESHOLDS[category] === 'number' ? THRESHOLDS[category] : 0.3;

      for (const match of findSimilarTerms(variant, categoryTerms, threshold)) {
        const known = bestByTerm.get(match.term);
        if (!known || match.distance < known.distance) {
          bestByTerm.delete(match.term);
          // Attach the category so callers can report why the name matched.
          bestByTerm.set(match.term, { ...match, category });
        }
      }
    }
  }

  const similarTerms = [...bestByTerm.values()].sort((a, b) => a.distance - b.distance);

  return {
    hasSimilarTerms: similarTerms.length > 0,
    similarTerms,
    bestMatch: similarTerms.length > 0 ? similarTerms[0] : null
  };
}
|
||||
|
||||
/**
 * Runs the Levenshtein check for one category with that category's
 * threshold (0.3 when the category has no numeric entry in THRESHOLDS).
 *
 * Every returned match is tagged with `category`, so callers can tell which
 * category a fuzzy hit belongs to.
 *
 * @param {string} firstname - First name.
 * @param {string} lastname - Last name.
 * @param {Array<string>} blacklistTerms - Terms of the category.
 * @param {string} category - Category the terms belong to.
 * @returns {Object} `{ hasSimilarTerms, similarTerms, bestMatch }` as
 *   returned by checkWithLevenshtein, with `category` added to each match.
 */
function checkWithCategoryThreshold(firstname, lastname, blacklistTerms, category) {
  // The type check keeps inherited keys such as "constructor" from being
  // used as a threshold.
  const configured = THRESHOLDS[category];
  const threshold = typeof configured === 'number' ? configured : 0.3;

  const result = checkWithLevenshtein(firstname, lastname, blacklistTerms, threshold);
  const similarTerms = result.similarTerms.map((match) => ({ ...match, category }));

  return {
    ...result,
    similarTerms,
    bestMatch: similarTerms.length > 0 ? similarTerms[0] : null
  };
}
|
||||
|
||||
module.exports = {
|
||||
levenshteinDistance,
|
||||
normalizedLevenshteinDistance,
|
||||
isSimilarToBlacklistTerm,
|
||||
findSimilarTerms,
|
||||
checkWithLevenshtein,
|
||||
checkWithCategoryThreshold,
|
||||
checkWithTrigramIndex,
|
||||
TrigramIndex,
|
||||
THRESHOLDS
|
||||
};
|
||||
Reference in New Issue
Block a user