瀏覽代碼

增加敏感词过滤功能

zhouzhijian 9 年之前
父節點
當前提交
7f373b9956

文件差異過大導致無法顯示
+ 14600 - 0
data/resource/sensitive_word/dictionary.txt


+ 3 - 1
fooder.php

@@ -33,4 +33,6 @@ require_once(BASE_CORE_PATH . '/framework/libraries/validate.php');
 require_once(BASE_CORE_PATH.'/framework/libraries/resizeimage.php');
 require_once (BASE_MOBILE_PATH . '/control/log.php');
 require_once (BASE_ROOT_PATH . '/helper/performance_helper.php');
-?>
+require_once (BASE_ROOT_PATH . '/helper/sensitive_word/dfa.php');
+// The closing PHP tag is intentionally omitted: PHP swallows only a single
+// newline after it, so any further blank line at the end of this include
+// file would be sent to the client as output.

+ 3 - 0
helper/fcgi_server.php

@@ -50,6 +50,9 @@ class fcgi_server
 
     public function run_looper()
     {
+        //载入敏感词词库
+        DFAFilter::instance();
+
         require_once(BASE_ROOT_PATH.'/mobile/index.php');
         Base::mobile_init();
 

+ 0 - 15
helper/sensitive/DFAFilter.php

@@ -1,15 +0,0 @@
-<?php
-
-/**
- * Created by PhpStorm.
- * User: stanley-king
- * Date: 16/5/8
- * Time: 下午1:49
- */
-
-namespace sensitive_word;
-
-class DFAFilter
-{
-    // Empty placeholder class: no properties or methods are defined.
-}

+ 0 - 95
helper/sensitive/DFAItem.php

@@ -1,95 +0,0 @@
-<?php
-/**
- * Created by PhpStorm.
- * User: stanley-king
- * Date: 16/5/8
- * Time: 下午1:50
- */
-
-namespace sensitive_word;
-
-class DFAItem // One node of the sensitive-word tree: a single token plus its child nodes.
-{
-    private $word = null; // token held by this node (element 0 of the array passed to init())
-    private $sub_items = array(); // child DFAItem nodes
-    private $is_end = 0; // set to 1 by init() when it receives a one-element array, otherwise 0
-
-    public function __set($name, $value) // magic setter: writes the named property
-    {
-        $this->$name = $value;
-    }
-
-    public function __get($name) // magic getter: reads the named property
-    {
-        return $this->$name;
-    }
-
-    // Initialise this node from the token array $word: keep $word[0] here and chain the remaining tokens below it. Returns null for an empty array, otherwise $this.
-    public function init($word)
-    {
-        $cnt = count($word);
-        if ($cnt <= 0) {
-            return null;
-        } else if ($cnt === 1) {
-            $this->is_end = 1;
-            $this->word = $word[0];
-            $this->sub_items = null; // NOTE(review): addwords()/checkword() iterate sub_items with foreach; null here looks unintended - confirm
-        } else {
-            $this->is_end = 0;
-            $this->word = $word[0];
-            $this->add_item(array_slice($word, 1));
-        }
-        return $this;
-    }
-
-    // Add a child node built from $word; nothing is appended when init() returns null (empty array).
-    public function add_item($word)
-    {
-        $subitem = new DFAItem();
-        $ret = $subitem->init($word);
-        if (!is_null($ret)) {
-            array_push($this->sub_items, $subitem);
-        }
-    }
-
-    // Insert the token array $word below this node: recurse into every child whose token equals $word[0], or append a new child when none matches.
-    public function addwords($word)
-    {
-        $found = false;
-        foreach ($this->sub_items as $item) {
-            if (0 == strcmp($word[0], $item->word)) {
-                $item->addwords(array_slice($word, 1));
-                $found = true;
-            }
-        }
-
-        if (!$found) {
-            $subitem = new DFAItem();
-            $subitem->init($word); // NOTE(review): unlike add_item(), the result of init() is not checked before the push below
-            array_push($this->sub_items, $subitem);
-        }
-    }
-
-    // Check whether $txt begins with a token sequence stored below this node: true on reaching a child flagged is_end, false for null input or when no child matches the leading character.
-    public function checkword($txt)
-    {
-        if (is_null($txt)) {
-            return false;
-        }
-        $head = mb_substr($txt, 0, 1);
-        $found = false; // never set to true below, so the final check always returns false
-        foreach ($this->sub_items as $item) {
-
-            if (0 == strcmp($item->word, $head)) {
-                if ($item->is_end == 1) {
-                    return true;
-                } else {
-                    return $item->checkword(mb_substr($txt, 1));
-                }
-            }
-        }
-        if (!$found) {
-            return false;
-        }
-    }
-}

+ 0 - 64
helper/sensitive/SensitiveWordInit.php

@@ -1,64 +0,0 @@
-<?php
-
-/**
- * 初始化敏感词库<br>
- * 将敏感词加入到HashMap中<br>
- * 构建DFA算法模型
- *
- * @author dxm
- *
- */
-
-namespace sensitive_word;
-
-require_once(BASE_ROOT_PATH . "/helper/sensitive/DFAItem.php");
-
-class SensitiveWordInit
-{
-    // Character encoding
-    const ENCODING = "UTF-8";
-
-    /**
-     * Build the sensitive-word tree from the word list returned by
-     * readSensitiveWordFile().
-     *
-     * @return DFAItem root node produced by addSensitiveWordToHashMap()
-     */
-    public function initKeyWord()
-    {
-        $word_array = $this->readSensitiveWordFile();
-        return $this->addSensitiveWordToHashMap($word_array);
-    }
-
-    /**
-     * Insert every entry of $words into a fresh DFAItem tree. Each entry is
-     * split on single spaces and handed to DFAItem::addwords().
-     * Illustration of the intended DFA structure (carried over from the original comment):<br>
-     * 中 = { isEnd = 0 国 = {<br>
-     * isEnd = 1 人 = {isEnd = 0 民 = {isEnd = 1} } 男 = { isEnd = 0 人 = { isEnd =
-     * 1 } } } } 五 = { isEnd = 0 星 = { isEnd = 0 红 = { isEnd = 0 旗 = { isEnd = 1
-     * } } } }
-     *
-     * @return DFAItem the root node
-     */
-    public function addSensitiveWordToHashMap($words)
-    {
-        $dfa = new DFAItem();
-        foreach ($words as $word) {
-            $dfa->addwords(explode(" ",$word));
-        }
-        return $dfa;
-    }
-
-    /**
-     * Return the sensitive words as an array of space-separated token strings.
-     * Despite the name, no file is read: the four entries are hard-coded.
-     *
-     * @return array
-     */
-    private function readSensitiveWordFile()
-    {
-        $word_array = array();
-        array_push($word_array, '中 国');
-        array_push($word_array, '中 央');
-        array_push($word_array, '国 家');
-        array_push($word_array, '他 妈 的');
-        return $word_array;
-    }
-}

+ 0 - 112
helper/sensitive/SensitivewordFilter.php

@@ -1,112 +0,0 @@
-<?php
-
-/**
- * 敏感词过滤
- *
- * @author dxm
- *
- */
-
-namespace sensitive_word;
-
-require_once(BASE_ROOT_PATH . "/helper/sensitive/DFAItem.php");
-
-class SensitivewordFilter
-{
-
-    private $dfa = null; // root of the word tree; assigned in getInstance()
-
-    // Minimum-match rule
-    public static $minMatchTYpe = 1;
-
-    // Maximum-match rule
-    public static $maxMatchType = 2;
-
-    // Singleton instance
-    private static $inst = null;
-
-    /**
-     * Constructor. Intentionally empty: the word tree is attached by getInstance().
-     */
-    function __construct()
-    {
-    }
-
-
-    /**
-     * Return the singleton, creating it on first use and attaching the tree
-     * built by SensitiveWordInit::initKeyWord().
-     *
-     * @return SensitivewordFilter
-     */
-    public static function getInstance()
-    {
-        if (null == self::$inst) {
-            self::$inst = new SensitivewordFilter();
-            self::$inst->dfa = (new \sensitive_word\SensitiveWordInit())->initKeyWord();
-        }
-        return self::$inst;
-    }
-
-    /**
-     * Tell whether the text contains a sensitive word by delegating to
-     * checkword() on the word tree.
-     *
-     * @param txt text to check
-     * @return the result of checkword()
-     */
-    public function isContaintSensitiveWord($txt)
-    {
-        return $this->dfa->checkword($txt);
-    }
-
-    /**
-     * Get the sensitive words contained in the text.
-     * Not implemented: the body is empty.
-     *
-     * @param txt
-     * @param matchType
-     * @return
-     */
-    public function getSensitiveWord($txt, $matchType)
-    {
-
-    }
-
-    /**
-     * Replace the sensitive words in the text.
-     * Not implemented: the body is empty.
-     *
-     * @param txt
-     * @param matchType
-     * @param replaceChar
-     * @return
-     */
-    public function  replaceSensitiveWord($txt, $matchType, $replaceChar)
-    {
-
-    }
-
-    /**
-     * Build the replacement string.
-     * Not implemented: the body is empty.
-     *
-     * @param replaceChar
-     * @param length
-     * @return
-     */
-    private function getReplaceChars($replaceChar, $length)
-    {
-    }
-
-    /**
-     * Check whether the text contains a sensitive word. Intended rule:<br>
-     * return the length of the sensitive word when one exists, otherwise 0.
-     * Not implemented: the body is empty.
-     *
-     * @param txt
-     * @param beginIndex
-     * @param matchType
-     * @return
-     */
-    public function CheckSensitiveWord($txt, $beginIndex, $matchType)
-    {
-
-    }
-
-}

+ 134 - 0
helper/sensitive_word/dfa.php

@@ -0,0 +1,134 @@
+<?php
+
+/**
+ * Sensitive-word filter backed by a character trie.
+ *
+ * The trie is built once from the dictionary file (one word per line). Every
+ * word is split into UTF-8 characters; each character becomes a nested array
+ * key, and the key 'end' marks a node at which a complete word terminates.
+ * Obtain the shared instance through DFAFilter::instance().
+ */
+class DFAFilter
+{
+    static private $stInstance = NULL;
+    private $dict;
+    const dictionary_path = BASE_RESOURCE_PATH . '/sensitive_word/dictionary.txt';
+
+    /**
+     * Return the shared filter, building it (and loading the dictionary) on first use.
+     *
+     * @return DFAFilter
+     * @throws RuntimeException when the dictionary file cannot be opened
+     */
+    static public function instance()
+    {
+        if (self::$stInstance === null) {
+            self::$stInstance = new DFAFilter();
+        }
+        return self::$stInstance;
+    }
+
+    private function __construct()
+    {
+        $this->loadFile();
+    }
+
+    /**
+     * Reset the trie and load it from a dictionary file.
+     *
+     * @param string|null $dictPath dictionary to read; null selects self::dictionary_path.
+     *                              The parameter used to be required although the constructor
+     *                              passes nothing, which is an ArgumentCountError on PHP >= 7.1.
+     * @throws RuntimeException when the file cannot be opened
+     */
+    private function loadFile($dictPath = null)
+    {
+        $this->dict = array();
+        $this->initDict($dictPath === null ? self::dictionary_path : $dictPath);
+    }
+
+    /**
+     * Read the dictionary line by line and insert every word into the trie.
+     *
+     * @param string|null $dictPath dictionary to read; null selects self::dictionary_path
+     * @throws RuntimeException when the file cannot be opened
+     */
+    private function initDict($dictPath = null)
+    {
+        $path = ($dictPath === null) ? self::dictionary_path : $dictPath;
+
+        $handle = fopen($path, 'r');
+        if (!$handle) {
+            throw new RuntimeException("cannot open sensitive_word dictionary file ={$path}.");
+        }
+
+        // Read whole lines: a fixed byte limit would cut long entries in two,
+        // possibly in the middle of a multi-byte character.
+        while (($line = fgets($handle)) !== false) {
+            $word = trim($line);
+
+            // Compare against '' explicitly; empty() would also discard the word "0".
+            if ($word === '') {
+                continue;
+            }
+
+            $uWord = $this->unicodeSplit($word);
+            if (count($uWord) === 0) {
+                // Nothing but invalid bytes: do not flag the trie root as a word end.
+                continue;
+            }
+
+            $pdict = &$this->dict;
+            foreach ($uWord as $char) {
+                if (!isset($pdict[$char])) {
+                    $pdict[$char] = array();
+                }
+                $pdict = &$pdict[$char];
+            }
+
+            $pdict['end'] = true;
+            unset($pdict); // drop the reference before the next word rebinds it
+        }
+
+        fclose($handle);
+    }
+
+    /**
+     * Replace every character of each dictionary word found in $str with '*'.
+     *
+     * Matching ignores the case of ASCII letters; text that is not masked keeps
+     * its original case (invalid UTF-8 bytes are dropped by unicodeSplit()).
+     * Matching is greedy: from a starting character the scan follows the first
+     * trie child it meets and masks the longest word completed on that path, so a
+     * shorter word is still masked when a longer candidate does not complete.
+     *
+     * @param string $str         text to filter
+     * @param int    $maxDistance values below 1 are treated as 1; consecutive characters of a
+     *                            word may be separated by up to $maxDistance - 1 other
+     *                            characters, which are left unmasked
+     * @return string the filtered text
+     */
+    public function filter($str, $maxDistance = 5)
+    {
+        if ($maxDistance < 1) {
+            $maxDistance = 1;
+        }
+
+        // $uStr keeps the original characters for output; $lookup is the
+        // ASCII-lower-cased twin used for trie lookups. Both share indexes.
+        $uStr = $this->unicodeSplit($str, false);
+        $lookup = array();
+        foreach ($uStr as $char) {
+            $lookup[] = (strlen($char) === 1) ? strtolower($char) : $char;
+        }
+        $count = count($uStr);
+
+        for ($i = 0; $i < $count; $i++) {
+            if (!isset($this->dict[$lookup[$i]])) {
+                continue;
+            }
+
+            $node = $this->dict[$lookup[$i]];
+            $matchIndexes = array();
+            // Number of entries of $matchIndexes that belong to the longest
+            // completed word; -1 while no word has been completed.
+            $endCount = isset($node['end']) ? 0 : -1;
+
+            for ($j = $i + 1, $d = 0; $d < $maxDistance && $j < $count; $j++, $d++) {
+                if (isset($node[$lookup[$j]])) {
+                    $matchIndexes[] = $j;
+                    $node = $node[$lookup[$j]];
+                    $d = -1; // restart the gap counter after every matched character
+                    if (isset($node['end'])) {
+                        $endCount = count($matchIndexes);
+                    }
+                }
+            }
+
+            if ($endCount >= 0) {
+                $uStr[$i] = '*';
+                foreach (array_slice($matchIndexes, 0, $endCount) as $k) {
+                    // Skip over directly adjacent masked characters so the outer
+                    // scan resumes after them.
+                    if ($k - $i === 1) {
+                        $i = $k;
+                    }
+                    $uStr[$k] = '*';
+                }
+            }
+        }
+
+        return implode('', $uStr);
+    }
+
+    /**
+     * Split a UTF-8 string into an array of characters.
+     *
+     * Bytes that do not form a complete 1- to 4-byte sequence (stray continuation
+     * bytes, truncated or malformed sequences) are dropped.
+     *
+     * @param string $str       text to split
+     * @param bool   $lowercase when true (default) ASCII letters are lower-cased; only
+     *                          single-byte characters are touched, so multi-byte
+     *                          sequences can never be corrupted by the conversion
+     * @return array list of single-character strings
+     */
+    public function unicodeSplit($str, $lowercase = true)
+    {
+        $ret = array();
+        $len = strlen($str);
+        for ($i = 0; $i < $len; $i++) {
+            $c = ord($str[$i]);
+
+            if ($c & 0x80) {
+                if (($c & 0xf8) == 0xf0 && $len - $i >= 4) {
+                    if ((ord($str[$i + 1]) & 0xc0) == 0x80 && (ord($str[$i + 2]) & 0xc0) == 0x80 && (ord($str[$i + 3]) & 0xc0) == 0x80) {
+                        $ret[] = substr($str, $i, 4);
+                        $i += 3;
+                    }
+                } else if (($c & 0xf0) == 0xe0 && $len - $i >= 3) {
+                    if ((ord($str[$i + 1]) & 0xc0) == 0x80 && (ord($str[$i + 2]) & 0xc0) == 0x80) {
+                        $ret[] = substr($str, $i, 3);
+                        $i += 2;
+                    }
+                } else if (($c & 0xe0) == 0xc0 && $len - $i >= 2) {
+                    if ((ord($str[$i + 1]) & 0xc0) == 0x80) {
+                        $ret[] = substr($str, $i, 2);
+                        $i += 1;
+                    }
+                }
+            } else {
+                $ret[] = $lowercase ? strtolower($str[$i]) : $str[$i];
+            }
+        }
+
+        return $ret;
+    }
+}
+
+

+ 8 - 0
helper/text_filter.php

@@ -21,9 +21,17 @@ class text_filter
 
         return $input;
     }
+
     /**
      * Pass $input through filter_html() and return the result.
      *
      * @param mixed $input value handed to filter_html()
      * @return mixed whatever filter_html() returns
      */
     static public function filter_input($input)
     {
         return self::filter_html($input);
     }
+
+    /**
+     * Filter sensitive words: run $input through filter_html(), then hand the
+     * result to the shared DFAFilter instance.
+     *
+     * @param mixed $input value handed to filter_html()
+     * @return string the text returned by DFAFilter::filter()
+     */
+    static public function filter_sensitive_word($input)
+    {
+        $cleaned = self::filter_html($input);
+        $wordFilter = DFAFilter::instance();
+
+        return $wordFilter->filter($cleaned);
+    }
 }