<?php
// dfa.php
declare(strict_types=0);
  /**
   * Sensitive-word filter backed by a character trie (DFA).
   *
   * The dictionary file is read once, one word per line, when the shared
   * instance is first created. Every word is lower-cased and split into UTF-8
   * characters, and each character becomes one level of a nested array. A node
   * that completes a word carries the TERMINAL_KEY entry.
   *
   * Usage: DFAFilter::instance()->filter($text);
   */
  class DFAFilter
  {
      /** @var DFAFilter|null Shared instance, created on first call to instance(). */
      static private $stInstance = null;

      /** @var array Trie root: character => child node (nested arrays). */
      private $dict;

      const dictionary_path = BASE_RESOURCE_PATH . '/sensitive_word/dictionary.txt';

      /**
       * Key marking "a dictionary word ends at this node". It is three ASCII
       * bytes long, so it can never equal a single character produced by the
       * splitter (one ASCII byte or one multi-byte sequence).
       */
      const TERMINAL_KEY = 'end';

      /**
       * Returns the shared filter, building it (and loading the dictionary)
       * on first use.
       *
       * @return DFAFilter
       * @throws RuntimeException when the dictionary file cannot be opened.
       */
      static public function instance()
      {
          if (self::$stInstance === null) {
              self::$stInstance = new DFAFilter();
          }

          return self::$stInstance;
      }

      /**
       * Private: instances are obtained through instance().
       */
      private function __construct()
      {
          $this->loadFile();
      }

      /**
       * Resets the trie and rebuilds it from the dictionary file.
       */
      private function loadFile()
      {
          $this->dict = [];
          $this->initDict();
      }

      /**
       * Reads the dictionary file line by line and inserts every non-empty,
       * trimmed line into the trie.
       *
       * @throws RuntimeException when the dictionary file cannot be opened.
       */
      private function initDict()
      {
          $handle = fopen(self::dictionary_path, 'r');
          if (!$handle) {
              $path = self::dictionary_path;
              throw new RuntimeException("cannot open sensitive_word dictionary file ={$path}.");
          }

          try {
              // No length limit: a capped read would split a long line into
              // several bogus words, possibly in the middle of a character.
              while (($line = fgets($handle)) !== false) {
                  $word = trim($line);
                  // Strict comparison so that the entry "0" is not discarded.
                  if ($word === '') {
                      continue;
                  }

                  $chars = $this->unicodeSplit($word);
                  if ($chars === []) {
                      // Nothing decodable on this line; inserting it would put
                      // the terminal marker on the trie root.
                      continue;
                  }

                  $node = &$this->dict;
                  foreach ($chars as $char) {
                      if (!isset($node[$char])) {
                          $node[$char] = [];
                      }
                      $node = &$node[$char];
                  }
                  $node[self::TERMINAL_KEY] = true;
              }
              unset($node);
          } finally {
              fclose($handle);
          }
      }

      /**
       * Replaces every character of each dictionary word found in $str with '*'.
       *
       * Matching is case-insensitive for ASCII letters (via strtolower), while
       * the returned text keeps the caller's original casing. Characters of a
       * word may be separated by unrelated characters: after a matched
       * character, the next one must occur among the following $maxDistance
       * characters, so 1 means strictly contiguous. Unrelated characters lying
       * between matched ones are left untouched. When several dictionary words
       * start at the same position, the longest complete one is masked.
       *
       * Bytes that do not form a 2-4 byte sequence accepted by the splitter are
       * dropped from the result.
       *
       * @param string $str         Text to filter.
       * @param int    $maxDistance Look-ahead window per character; values
       *                            below 1 are treated as 1.
       * @return string The filtered text.
       */
      public function filter($str, $maxDistance = 10)
      {
          if ($maxDistance < 1) {
              $maxDistance = 1;
          }

          // $chars is what gets returned; $keys is the lower-cased copy used
          // for trie look-ups. Mapping per character keeps both arrays aligned.
          $chars = $this->splitUtf8((string) $str);
          $keys = array_map('strtolower', $chars);
          $count = count($chars);

          for ($i = 0; $i < $count; $i++) {
              if (!isset($this->dict[$keys[$i]])) {
                  continue;
              }

              $node = $this->dict[$keys[$i]];
              $matched = [];
              // Number of entries of $matched that belong to the longest
              // complete word seen so far; -1 while no word has been completed.
              $wordLen = isset($node[self::TERMINAL_KEY]) ? 0 : -1;

              for ($j = $i + 1, $d = 0; $d < $maxDistance && $j < $count; $j++, $d++) {
                  if (!isset($node[$keys[$j]])) {
                      continue;
                  }
                  $matched[] = $j;
                  $node = $node[$keys[$j]];
                  $d = -1; // becomes 0 after the loop increment: window restarts
                  if (isset($node[self::TERMINAL_KEY])) {
                      $wordLen = count($matched);
                  }
              }

              if ($wordLen < 0) {
                  continue;
              }

              // Masked positions are also masked in $keys so that later scans
              // see '*' there and cannot start or continue a match on them.
              $chars[$i] = $keys[$i] = '*';
              $resume = $i;
              foreach (array_slice($matched, 0, $wordLen) as $k) {
                  if ($k - $resume === 1) {
                      // Contiguous with what is already masked: scanning may
                      // resume after it.
                      $resume = $k;
                  }
                  $chars[$k] = $keys[$k] = '*';
              }
              $i = $resume;
          }

          return implode('', $chars);
      }

      /**
       * Lower-cases $str with strtolower() and splits it into UTF-8 characters.
       *
       * @param string $str
       * @return array List of single-character strings; bytes that do not form
       *               an accepted sequence are omitted.
       */
      public function unicodeSplit($str)
      {
          return $this->splitUtf8(strtolower($str));
      }

      /**
       * Splits a byte string into UTF-8 characters without changing case.
       *
       * A byte below 0x80 is one character. A lead byte of the form 110xxxxx,
       * 1110xxxx or 11110xxx starts a 2-, 3- or 4-byte character, provided
       * enough bytes remain and every following byte has the form 10xxxxxx.
       * Any other byte is skipped on its own.
       *
       * @param string $str
       * @return array List of single-character strings.
       */
      private function splitUtf8($str)
      {
          $chars = [];
          $len = strlen($str);

          for ($i = 0; $i < $len; $i++) {
              $byte = ord($str[$i]);

              if ($byte < 0x80) {
                  $chars[] = $str[$i];
                  continue;
              }

              if (($byte & 0xf8) === 0xf0) {
                  $width = 4;
              } elseif (($byte & 0xf0) === 0xe0) {
                  $width = 3;
              } elseif (($byte & 0xe0) === 0xc0) {
                  $width = 2;
              } else {
                  continue; // stray continuation byte or unsupported lead byte
              }

              if ($len - $i < $width) {
                  continue; // sequence truncated by the end of the string
              }

              for ($k = 1; $k < $width; $k++) {
                  if ((ord($str[$i + $k]) & 0xc0) !== 0x80) {
                      continue 2; // malformed: drop only the lead byte
                  }
              }

              $chars[] = substr($str, $i, $width);
              $i += $width - 1;
          }

          return $chars;
      }
  }