loadFile(); } private function loadFile() { $this->dict = array(); $this->initDict(); } private function initDict() { $handle = fopen(self::dictionary_path, 'r'); if (!$handle) { $path = self::dictionary_path; throw new RuntimeException("cannot open sensitive_word dictionary file ={$path}."); } while (!feof($handle)) { $word = trim(fgets($handle, 128)); if (empty($word)) { continue; } $uWord = $this->unicodeSplit($word); $pdict = &$this->dict; $count = count($uWord); for ($i = 0; $i < $count; $i++) { if (!isset($pdict[$uWord[$i]])) { $pdict[$uWord[$i]] = array(); } $pdict = &$pdict[$uWord[$i]]; } $pdict['end'] = true; } fclose($handle); } public function filter($str, $maxDistance = 10) { if ($maxDistance < 1) { $maxDistance = 1; } $uStr = $this->unicodeSplit($str); $count = count($uStr); for ($i = 0; $i < $count; $i++) { if (isset($this->dict[$uStr[$i]])) { $pdict = &$this->dict[$uStr[$i]]; $matchIndexes = array(); for ($j = $i + 1, $d = 0; $d < $maxDistance && $j < $count; $j++, $d++) { if (isset($pdict[$uStr[$j]])) { $matchIndexes[] = $j; $pdict = &$pdict[$uStr[$j]]; $d = -1; } } if (isset($pdict['end'])) { $uStr[$i] = '*'; foreach ($matchIndexes as $k) { if ($k - $i == 1) { $i = $k; } $uStr[$k] = '*'; } } } } return implode('',$uStr); } public function unicodeSplit($str) { $str = strtolower($str); $ret = array(); $len = strlen($str); for ($i = 0; $i < $len; $i++) { $c = ord($str[$i]); if ($c & 0x80) { if (($c & 0xf8) == 0xf0 && $len - $i >= 4) { if ((ord($str[$i + 1]) & 0xc0) == 0x80 && (ord($str[$i + 2]) & 0xc0) == 0x80 && (ord($str[$i + 3]) & 0xc0) == 0x80) { $uc = substr($str, $i, 4); $ret[] = $uc; $i += 3; } } elseif (($c & 0xf0) == 0xe0 && $len - $i >= 3) { if ((ord($str[$i + 1]) & 0xc0) == 0x80 && (ord($str[$i + 2]) & 0xc0) == 0x80) { $uc = substr($str, $i, 3); $ret[] = $uc; $i += 2; } } elseif (($c & 0xe0) == 0xc0 && $len - $i >= 2) { if ((ord($str[$i + 1]) & 0xc0) == 0x80) { $uc = substr($str, $i, 2); $ret[] = $uc; $i += 1; } } } else { $ret[] = $str[$i]; } } return $ret; } }