浏览代码

sensitive words

liax 9 年之前
父节点
当前提交
b783c79950
共有 3 个文件被更改,包括 256 次插入2 次删除
  1. 80 2
      helper/sensitive/DFAItem.php
  2. 64 0
      helper/sensitive/SensitiveWordInit.php
  3. 112 0
      helper/sensitive/SensitivewordFilter.php

+ 80 - 2
helper/sensitive/DFAItem.php

@@ -8,10 +8,88 @@
 
 namespace sensitive_word;
 
-
 class DFAItem
 {
-    private $word = array();
+    // Token stored at this node; stays null until init() assigns it
+    // (checkword() compares it against a single character of the text).
+    private $word = null;
+    // Child nodes reachable from this node. Note: init() replaces this with
+    // null when the node is created from a single-token list.
+    private $sub_items = array();
+    // 1 when a word ends at this node, 0 otherwise.
+    private $is_end = 0;
+
+    /**
+     * Magic setter: writes $value into the property called $name, which lets
+     * code outside the class assign the private fields.
+     *
+     * @param string $name  Property name.
+     * @param mixed  $value Value to store.
+     */
+    public function __set($name, $value)
+    {
+        $this->{$name} = $value;
+    }
+
+    /**
+     * Magic getter: returns the property called $name, which lets code
+     * outside the class read the private fields.
+     *
+     * @param string $name Property name.
+     * @return mixed Current value of that property.
+     */
+    public function __get($name)
+    {
+        $value = $this->{$name};
+        return $value;
+    }
+
+    /**
+     * Initialise this node, and the chain below it, from a token list.
+     *
+     * The first token becomes this node's word. Any remaining tokens are
+     * attached as one chain of descendants through add_item().
+     *
+     * @param array $word Tokens of one word, indexed from 0.
+     * @return DFAItem|null $this on success, or null when $word is empty.
+     */
+    public function init($word)
+    {
+        $cnt = count($word);
+        if ($cnt <= 0) {
+            return null;
+        }
+
+        $this->word = $word[0];
+        if ($cnt === 1) {
+            // Last token: a word ends here. The child list is reset to an
+            // empty array (not null) so that later addwords()/checkword()
+            // calls on this node can still iterate over it and append to it.
+            $this->is_end = 1;
+            $this->sub_items = array();
+        } else {
+            $this->is_end = 0;
+            $this->add_item(array_slice($word, 1));
+        }
+        return $this;
+    }
+
+    /**
+     * Build a new child chain from $word and attach it to this node.
+     *
+     * Nothing is attached when init() rejects the token list (returns null).
+     *
+     * @param array $word Tokens for the child chain.
+     */
+    public function add_item($word)
+    {
+        $child = new DFAItem();
+        if (is_null($child->init($word))) {
+            return;
+        }
+        array_push($this->sub_items, $child);
+    }
+
+    /**
+     * Insert one word (given as a token list) below this node.
+     *
+     * If a child already holds the first token, the remaining tokens are
+     * inserted below that child; when no tokens remain, that child is marked
+     * as the end of a word. Otherwise a new child chain is created.
+     *
+     * @param array $word Tokens of the word, indexed from 0.
+     */
+    public function addwords($word)
+    {
+        // Nothing to insert: avoids reading $word[0] on an empty list and
+        // attaching an uninitialised child.
+        if (!is_array($word) || count($word) === 0) {
+            return;
+        }
+
+        // A node created by init() from a single token may carry null here.
+        if (!is_array($this->sub_items)) {
+            $this->sub_items = array();
+        }
+
+        $rest = array_slice($word, 1);
+        foreach ($this->sub_items as $item) {
+            if (0 == strcmp($word[0], $item->word)) {
+                if (count($rest) === 0) {
+                    // The new word is a prefix of an existing path: the word
+                    // ends on this already-present node.
+                    $item->is_end = 1;
+                } else {
+                    $item->addwords($rest);
+                }
+                return;
+            }
+        }
+
+        // No child holds the first token yet: attach a fresh chain, but only
+        // if init() accepted the token list.
+        $subitem = new DFAItem();
+        if (!is_null($subitem->init($word))) {
+            $this->sub_items[] = $subitem;
+        }
+    }
 
+    /**
+     * Tell whether $txt starts with a word stored below this node.
+     *
+     * The first character of $txt is compared with each child. On a match the
+     * answer is true if a word ends at that child; otherwise the rest of $txt
+     * is checked against that child's own children.
+     *
+     * @param string|null $txt Text to test, matched from its first character.
+     * @return bool True when a stored word is a prefix of $txt.
+     */
+    public function checkword($txt)
+    {
+        // Empty text cannot start with a word, and a node whose child list is
+        // not an array (init() may leave null there) has nothing to match.
+        if (is_null($txt) || $txt === '' || !is_array($this->sub_items)) {
+            return false;
+        }
+
+        $head = mb_substr($txt, 0, 1);
+        foreach ($this->sub_items as $item) {
+            if (0 == strcmp($item->word, $head)) {
+                if ($item->is_end == 1) {
+                    return true;
+                }
+                return $item->checkword(mb_substr($txt, 1));
+            }
+        }
+        return false;
+    }
 }

+ 64 - 0
helper/sensitive/SensitiveWordInit.php

@@ -0,0 +1,64 @@
+<?php
+
+/**
+ * 初始化敏感词库<br>
+ * 将敏感词加入到HashMap中<br>
+ * 构建DFA算法模型
+ *
+ * @author dxm
+ *
+ */
+
+namespace sensitive_word;
+
+require_once(BASE_ROOT_PATH . "/helper/sensitive/DFAItem.php");
+
+class SensitiveWordInit
+{
+    // Character encoding name (not referenced inside this class).
+    const ENCODING = "UTF-8";
+
+    /**
+     * Build the sensitive-word tree from the built-in word list.
+     *
+     * @return DFAItem Root node of the tree.
+     */
+    public function initKeyWord()
+    {
+        $word_array = $this->readSensitiveWordFile();
+        return $this->addSensitiveWordToHashMap($word_array);
+    }
+
+    /**
+     * Insert every word into a fresh DFAItem tree and return its root.
+     *
+     * Each word is a string of space-separated tokens. For example, the
+     * words '中 国' and '中 央' share the child '中' of the root, which then
+     * has the two children '国' and '央', each marking the end of a word.
+     *
+     * @param array $words Words written as space-separated tokens.
+     * @return DFAItem Root node of the tree.
+     */
+    public function addSensitiveWordToHashMap($words)
+    {
+        $dfa = new DFAItem();
+        foreach ($words as $word) {
+            // Drop empty tokens (empty entry, doubled or trailing spaces):
+            // an empty token would otherwise become a node that matches
+            // empty text.
+            $tokens = array();
+            foreach (explode(" ", $word) as $token) {
+                if ($token !== '') {
+                    $tokens[] = $token;
+                }
+            }
+            if (count($tokens) > 0) {
+                $dfa->addwords($tokens);
+            }
+        }
+        return $dfa;
+    }
+
+    /**
+     * Return the list of sensitive words.
+     *
+     * The list is currently hard-coded; no file is read.
+     *
+     * @return array Words written as space-separated tokens.
+     */
+    private function readSensitiveWordFile()
+    {
+        return array(
+            '中 国',
+            '中 央',
+            '国 家',
+            '他 妈 的',
+        );
+    }
+}

+ 112 - 0
helper/sensitive/SensitivewordFilter.php

@@ -0,0 +1,112 @@
+<?php
+
+/**
+ * 敏感词过滤
+ *
+ * @author dxm
+ *
+ */
+
+namespace sensitive_word;
+
+require_once(BASE_ROOT_PATH . "/helper/sensitive/DFAItem.php");
+
+class SensitivewordFilter
+{
+
+    // Root of the sensitive-word tree produced by SensitiveWordInit.
+    private $dfa = null;
+
+    // Match type: stop at the shortest word found at a position.
+    public static $minMatchTYpe = 1;
+
+    // Match type: keep going and report the longest word found at a position.
+    public static $maxMatchType = 2;
+
+    // Shared instance handed out by getInstance().
+    private static $inst = null;
+
+    /**
+     * Constructor: builds the sensitive-word tree, so that an instance is
+     * usable whether it comes from getInstance() or from `new`.
+     */
+    public function __construct()
+    {
+        // This file only requires DFAItem.php; load the initialiser on
+        // demand unless it is already defined or an autoloader provides it.
+        if (!class_exists(__NAMESPACE__ . '\\SensitiveWordInit')) {
+            require_once(BASE_ROOT_PATH . "/helper/sensitive/SensitiveWordInit.php");
+        }
+        $this->dfa = (new \sensitive_word\SensitiveWordInit())->initKeyWord();
+    }
+
+    /**
+     * Return the shared filter instance, creating it on first use.
+     *
+     * @return SensitivewordFilter
+     */
+    public static function getInstance()
+    {
+        if (null === self::$inst) {
+            self::$inst = new SensitivewordFilter();
+        }
+        return self::$inst;
+    }
+
+    /**
+     * Tell whether the text contains a sensitive word at any position.
+     *
+     * @param string|null $txt Text to inspect.
+     * @return bool True when at least one sensitive word occurs in $txt.
+     */
+    public function isContaintSensitiveWord($txt)
+    {
+        $chars = $this->splitChars($txt);
+        $total = count($chars);
+        for ($i = 0; $i < $total; $i++) {
+            if ($this->matchLengthAt($chars, $i, self::$minMatchTYpe) > 0) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Collect the distinct sensitive words that occur in the text.
+     *
+     * @param string|null $txt       Text to inspect.
+     * @param int         $matchType self::$minMatchTYpe or self::$maxMatchType.
+     * @return array Distinct matched words, in order of first occurrence.
+     */
+    public function getSensitiveWord($txt, $matchType)
+    {
+        $chars = $this->splitChars($txt);
+        $total = count($chars);
+        $found = array();
+        for ($i = 0; $i < $total; $i++) {
+            $len = $this->matchLengthAt($chars, $i, $matchType);
+            if ($len > 0) {
+                $hit = implode('', array_slice($chars, $i, $len));
+                if (!in_array($hit, $found, true)) {
+                    $found[] = $hit;
+                }
+                // Continue after the matched word.
+                $i += $len - 1;
+            }
+        }
+        return $found;
+    }
+
+    /**
+     * Replace every character of every sensitive word with $replaceChar.
+     *
+     * @param string|null $txt         Text to filter.
+     * @param int         $matchType   self::$minMatchTYpe or self::$maxMatchType.
+     * @param string      $replaceChar Replacement for each matched character.
+     * @return string The filtered text.
+     */
+    public function  replaceSensitiveWord($txt, $matchType, $replaceChar)
+    {
+        $chars = $this->splitChars($txt);
+        $total = count($chars);
+        $result = '';
+        for ($i = 0; $i < $total; $i++) {
+            $len = $this->matchLengthAt($chars, $i, $matchType);
+            if ($len > 0) {
+                $result .= $this->getReplaceChars($replaceChar, $len);
+                $i += $len - 1;
+            } else {
+                $result .= $chars[$i];
+            }
+        }
+        return $result;
+    }
+
+    /**
+     * Build the replacement string: $replaceChar repeated $length times.
+     *
+     * @param string $replaceChar Text to repeat.
+     * @param int    $length      Number of repetitions; values below 1 give ''.
+     * @return string
+     */
+    private function getReplaceChars($replaceChar, $length)
+    {
+        $times = (int) $length;
+        if ($times <= 0) {
+            return '';
+        }
+        return str_repeat((string) $replaceChar, $times);
+    }
+
+    /**
+     * Check whether a sensitive word starts at the given character index.
+     *
+     * @param string|null $txt        Text to inspect.
+     * @param int         $beginIndex Zero-based character index to start at.
+     * @param int         $matchType  self::$minMatchTYpe or self::$maxMatchType.
+     * @return int Length in characters of the matched word, or 0 if none.
+     */
+    public function CheckSensitiveWord($txt, $beginIndex, $matchType)
+    {
+        return $this->matchLengthAt($this->splitChars($txt), (int) $beginIndex, $matchType);
+    }
+
+    /**
+     * Split text into a list of single characters.
+     *
+     * @param string|null $txt Text to split.
+     * @return array List of one-character strings; empty for null or ''.
+     */
+    private function splitChars($txt)
+    {
+        if (is_null($txt) || $txt === '') {
+            return array();
+        }
+        $chars = preg_split('//u', (string) $txt, -1, PREG_SPLIT_NO_EMPTY);
+        if (is_array($chars)) {
+            return $chars;
+        }
+        // preg_split() fails on text that is not valid UTF-8; fall back to
+        // the multibyte string functions.
+        $chars = array();
+        $total = mb_strlen((string) $txt);
+        for ($i = 0; $i < $total; $i++) {
+            $chars[] = mb_substr((string) $txt, $i, 1);
+        }
+        return $chars;
+    }
+
+    /**
+     * Walk the word tree along $chars starting at $beginIndex.
+     *
+     * @param array $chars      Characters of the text.
+     * @param int   $beginIndex Index in $chars to start from.
+     * @param int   $matchType  Minimum match stops at the first word end;
+     *                          any other value keeps the longest word end.
+     * @return int Number of characters matched, or 0 if no word starts there.
+     */
+    private function matchLengthAt(array $chars, $beginIndex, $matchType)
+    {
+        $total = count($chars);
+        if ($beginIndex < 0 || $beginIndex >= $total) {
+            return 0;
+        }
+
+        $matched = 0;
+        $node = $this->dfa;
+        for ($i = $beginIndex; $i < $total && $node !== null; $i++) {
+            $children = $node->sub_items;
+            if (!is_array($children)) {
+                // Leaf nodes may carry null instead of an empty list.
+                break;
+            }
+            $next = null;
+            foreach ($children as $child) {
+                if ((string) $child->word === $chars[$i]) {
+                    $next = $child;
+                    break;
+                }
+            }
+            if ($next === null) {
+                break;
+            }
+            $node = $next;
+            if ($node->is_end == 1) {
+                $matched = $i - $beginIndex + 1;
+                if ($matchType == self::$minMatchTYpe) {
+                    break;
+                }
+            }
+        }
+        return $matched;
+    }
+
+}