dfa.php 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134
  1. <?php
  /**
   * Sensitive-word filter backed by a character trie built from a dictionary file.
   *
   * The trie is a nested array keyed by single UTF-8 characters. A node that
   * completes a dictionary word carries the extra key 'end'; since every
   * character key is exactly one character long, the three-character key 'end'
   * cannot collide with one.
   *
   * Matching is case-insensitive for ASCII letters only: dictionary words and
   * input text are both lowercased byte-wise on their single-byte characters.
   *
   * The class is a lazily created singleton; the dictionary file is read when
   * the instance is first requested.
   */
  class DFAFilter
  {
      /** @var DFAFilter|null Shared instance, created on first call to instance(). */
      static private $stInstance = null;

      /** @var array Root node of the trie. */
      private $dict;

      /** Path of the dictionary file (one word per line). */
      const dictionary_path = BASE_RESOURCE_PATH . '/sensitive_word/dictionary.txt';

      /**
       * Returns the shared filter, building it (and loading the dictionary) on first use.
       *
       * @return DFAFilter
       * @throws RuntimeException when the dictionary file cannot be opened
       */
      static public function instance()
      {
          if (self::$stInstance === null) {
              self::$stInstance = new DFAFilter();
          }
          return self::$stInstance;
      }

      /**
       * Private so that the only way to obtain a filter is instance().
       *
       * @throws RuntimeException when the dictionary file cannot be opened
       */
      private function __construct()
      {
          $this->loadFile();
      }

      /**
       * Resets the trie and rebuilds it from the dictionary file.
       *
       * @throws RuntimeException when the dictionary file cannot be opened
       */
      private function loadFile()
      {
          $this->dict = array();
          $this->initDict();
      }

      /**
       * Reads the dictionary file line by line and inserts every word into the trie.
       *
       * Lines are trimmed; blank lines and lines that contain no decodable
       * character are skipped. A UTF-8 byte-order mark at the start of the file
       * is ignored.
       *
       * @throws RuntimeException when the dictionary file cannot be opened
       */
      private function initDict()
      {
          $handle = fopen(self::dictionary_path, 'r');
          if (!$handle) {
              $path = self::dictionary_path;
              throw new RuntimeException("cannot open sensitive_word dictionary file ={$path}.");
          }

          try {
              $firstLine = true;
              // Read whole lines: a length limit would cut long entries into
              // several bogus words and could split a multi-byte character.
              while (($line = fgets($handle)) !== false) {
                  if ($firstLine) {
                      $firstLine = false;
                      // A BOM would otherwise become the first "character" of the first word.
                      if (strncmp($line, "\xEF\xBB\xBF", 3) === 0) {
                          $line = substr($line, 3);
                      }
                  }

                  $word = trim($line);
                  // Compare against '' explicitly: empty() would also discard the word "0".
                  if ($word === '') {
                      continue;
                  }

                  $uWord = $this->unicodeSplit($word);
                  // Nothing decodable on this line; marking the root as a word end would be wrong.
                  if (count($uWord) === 0) {
                      continue;
                  }

                  // Walk down from the root, creating missing nodes on the way.
                  $node = &$this->dict;
                  foreach ($uWord as $char) {
                      if (!isset($node[$char])) {
                          $node[$char] = array();
                      }
                      $node = &$node[$char];
                  }
                  $node['end'] = true;
                  unset($node);
              }
          } finally {
              fclose($handle);
          }
      }

      /**
       * Replaces every character of each detected dictionary word with '*'.
       *
       * Scanning starts at each character that begins some dictionary word and
       * follows the trie forward. Characters that do not continue the word are
       * skipped (and left untouched in the output) as long as the next matching
       * character lies within $maxDistance positions of the previously matched
       * one. The longest dictionary word completed during that walk is masked,
       * even when the walk continued into a longer word that was not completed.
       *
       * The returned text keeps the original letter case. Bytes that do not form
       * a well-formed UTF-8 sequence are not part of the result.
       *
       * @param string $str         text to filter
       * @param int    $maxDistance maximum offset between two consecutive matched
       *                            characters; values below 1 are treated as 1
       *                            (1 means strictly adjacent)
       * @return string the filtered text
       */
      public function filter($str, $maxDistance = 10)
      {
          if ($maxDistance < 1) {
              $maxDistance = 1;
          }

          // $chars is what gets emitted (original case); $keys is what gets matched.
          $chars = $this->splitChars($str);
          $keys = $this->foldCase($chars);
          $count = count($chars);

          for ($i = 0; $i < $count; $i++) {
              if (!isset($this->dict[$keys[$i]])) {
                  continue;
              }

              $node = $this->dict[$keys[$i]];
              $matched = array($i);
              // Number of entries of $matched covered by the longest completed word so far.
              $wordLength = isset($node['end']) ? 1 : 0;

              for ($j = $i + 1, $gap = 0; $gap < $maxDistance && $j < $count; $j++, $gap++) {
                  if (isset($node[$keys[$j]])) {
                      $node = $node[$keys[$j]];
                      $matched[] = $j;
                      // Becomes 0 again after the loop increment: the gap restarts at each match.
                      $gap = -1;
                      if (isset($node['end'])) {
                          $wordLength = count($matched);
                      }
                  }
              }

              if ($wordLength === 0) {
                  continue;
              }

              $resume = $i;
              for ($m = 0; $m < $wordLength; $m++) {
                  $pos = $matched[$m];
                  $chars[$pos] = '*';
                  // Masked positions must not take part in later matches.
                  $keys[$pos] = '*';
                  // Skip the outer scan past the run of directly adjacent masked characters.
                  if ($pos - $resume === 1) {
                      $resume = $pos;
                  }
              }
              $i = $resume;
          }

          return implode('', $chars);
      }

      /**
       * Splits a UTF-8 string into an array of characters, lowercasing the
       * single-byte ones.
       *
       * Bytes that do not belong to a well-formed 1- to 4-byte sequence are dropped.
       *
       * @param string $str
       * @return array list of one-character strings
       */
      public function unicodeSplit($str)
      {
          return $this->foldCase($this->splitChars($str));
      }

      /**
       * Lowercases the single-byte entries of a character list.
       *
       * Multi-byte entries are returned unchanged, so a byte-wise lowercase can
       * never alter part of a multi-byte sequence.
       *
       * @param array $chars list of one-character strings
       * @return array same list with single-byte characters lowercased
       */
      private function foldCase($chars)
      {
          foreach ($chars as $idx => $char) {
              if (strlen($char) === 1) {
                  $chars[$idx] = strtolower($char);
              }
          }
          return $chars;
      }

      /**
       * Splits a UTF-8 string into characters without changing their case.
       *
       * A lead byte announces the sequence width; the sequence is accepted only
       * if enough bytes remain and every following byte is a continuation byte
       * (10xxxxxx). Anything else is skipped one byte at a time.
       *
       * @param string $str
       * @return array list of one-character strings
       */
      private function splitChars($str)
      {
          $str = (string) $str;
          $chars = array();
          $len = strlen($str);

          for ($i = 0; $i < $len; $i++) {
              $byte = ord($str[$i]);

              if (($byte & 0x80) === 0) {
                  $chars[] = $str[$i];
                  continue;
              }

              if (($byte & 0xf8) === 0xf0) {
                  $width = 4;
              } elseif (($byte & 0xf0) === 0xe0) {
                  $width = 3;
              } elseif (($byte & 0xe0) === 0xc0) {
                  $width = 2;
              } else {
                  // Stray continuation byte or invalid lead byte.
                  continue;
              }

              if ($len - $i < $width) {
                  continue;
              }

              $wellFormed = true;
              for ($k = 1; $k < $width; $k++) {
                  if ((ord($str[$i + $k]) & 0xc0) !== 0x80) {
                      $wellFormed = false;
                      break;
                  }
              }
              if (!$wellFormed) {
                  continue;
              }

              $chars[] = substr($str, $i, $width);
              // The loop increment consumes the last byte of the sequence.
              $i += $width - 1;
          }

          return $chars;
      }
  }