[From 0 to 1: A Ten-Million-Scale Live Streaming Project in Practice] Teammates Call This Approach Remarkably Elegant | Sensitive Word Filtering and Masking
Business Background
Implement sensitive word filtering and masking for IM private chat, live-room danmaku (bullet comments), public-screen messages, and lobby broadcast messages.
Requirements Analysis and Breakdown
- Define and import the sensitive word dictionaries
- Implement sensitive word matching with a DFA (trie) algorithm
- Define separate blacklist and whitelist dictionaries
Implementation Approach
- Package it as a non-intrusive component so teammates can reuse it easily
- Let callers choose between filtering (reject) and masking (replace) at the call site
- Use Spring EL expressions to dynamically read and modify bean property values
Implementation Process
Importing the Blacklist and Whitelist Dictionaries
One sensitive word per line; the dictionaries can be found online or written by hand.
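For illustration only, the two resource files might look like this (the entries are placeholders; every line is loaded as one word):

/blacklist.txt
badword
scamlink
badphrase

/whitelist.txt
badwordmuseum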
Wrapping the Dictionaries in a Utility Class
public class SensitiveWordUtil {

    /**
     * Dictionary context (loads and holds the word trie)
     */
    public static final WordContext CONTENT = new WordContext();

    /**
     * Filter built on top of the context
     */
    public static final WordFilter WORD_FILTER = new WordFilter(CONTENT);
}
public class WordContext {

    /**
     * Sensitive word dictionary (nested-map trie)
     */
    private final Map wordMap = new HashMap(1024);

    /**
     * Whether the dictionary has been initialized
     */
    private boolean init;

    /**
     * Blacklist file path
     */
    private final String blackList;

    /**
     * Whitelist file path
     */
    private final String whiteList;

    public WordContext() {
        this.blackList = "/blacklist.txt";
        this.whiteList = "/whitelist.txt";
        initKeyWord();
    }

    public WordContext(String blackList, String whiteList) {
        this.blackList = blackList;
        this.whiteList = whiteList;
        initKeyWord();
    }

    /**
     * Get the initialized sensitive word trie
     *
     * @return sensitive word trie
     */
    public Map getWordMap() {
        return wordMap;
    }
    /**
     * Initialization
     */
    private synchronized void initKeyWord() {
        try {
            if (!init) {
                // Load the blacklist into the trie
                addWord(readWordFile(blackList), WordType.BLACK);
                // Load the whitelist (non-sensitive words) into the trie as well
                addWord(readWordFile(whiteList), WordType.WHITE);
            }
            init = true;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Add words to the dictionary, building a DFA (trie) model out of nested maps, e.g.:<br>
     * 中 = { isEnd = 0, 国 = { isEnd = 1, 人 = { isEnd = 0, 民 = { isEnd = 1 } },
     *   男 = { isEnd = 0, 人 = { isEnd = 1 } } } }<br>
     * 五 = { isEnd = 0, 星 = { isEnd = 0, 红 = { isEnd = 0, 旗 = { isEnd = 1 } } } }
     */
    public void addWord(Iterable<String> wordList, WordType wordType) {
        Map nowMap;
        Map<String, String> newWordMap;
        // Iterate over the word list
        for (String key : wordList) {
            nowMap = wordMap;
            for (int i = 0; i < key.length(); i++) {
                // Current character
                char keyChar = key.charAt(i);
                // Child node for this character, if any
                Object subMap = nowMap.get(keyChar);
                if (subMap != null) {
                    // Node already exists, descend into it
                    nowMap = (Map) subMap;
                } else {
                    // Otherwise create a new node and mark it as "not the end of a word"
                    newWordMap = new HashMap<>(4);
                    newWordMap.put("isEnd", String.valueOf(EndType.HAS_NEXT.ordinal()));
                    nowMap.put(keyChar, newWordMap);
                    nowMap = newWordMap;
                }
                if (i == key.length() - 1) {
                    // Last character: mark the end of the word and whether it is a whitelist word
                    nowMap.put("isEnd", String.valueOf(EndType.IS_END.ordinal()));
                    nowMap.put("isWhiteWord", String.valueOf(wordType.ordinal()));
                }
            }
        }
    }
    /**
     * Remove words from the dictionary at runtime
     *
     * @param wordList words to remove
     * @param wordType BLACK for the blacklist, WHITE for the whitelist
     */
    public void removeWord(Iterable<String> wordList, WordType wordType) {
        Map nowMap;
        for (String key : wordList) {
            List<Map> cacheList = new ArrayList<>();
            nowMap = wordMap;
            for (int i = 0; i < key.length(); i++) {
                char keyChar = key.charAt(i);
                Object map = nowMap.get(keyChar);
                if (map != null) {
                    nowMap = (Map) map;
                    cacheList.add(nowMap);
                } else {
                    // The word is not in the trie, nothing to remove
                    return;
                }
                if (i == key.length() - 1) {
                    char[] keys = key.toCharArray();
                    boolean cleanable = false;
                    char lastChar = 0;
                    // Walk back from the tail node, pruning nodes that are no longer needed
                    for (int j = cacheList.size() - 1; j >= 0; j--) {
                        Map cacheMap = cacheList.get(j);
                        if (j == cacheList.size() - 1) {
                            // Only remove the word if it belongs to the requested list type
                            if (String.valueOf(WordType.BLACK.ordinal()).equals(cacheMap.get("isWhiteWord"))) {
                                if (wordType == WordType.WHITE) {
                                    return;
                                }
                            }
                            if (String.valueOf(WordType.WHITE.ordinal()).equals(cacheMap.get("isWhiteWord"))) {
                                if (wordType == WordType.BLACK) {
                                    return;
                                }
                            }
                            cacheMap.remove("isWhiteWord");
                            cacheMap.remove("isEnd");
                            if (cacheMap.size() == 0) {
                                cleanable = true;
                                continue;
                            }
                        }
                        if (cleanable) {
                            // Stop pruning once we reach a node that ends another word
                            Object isEnd = cacheMap.get("isEnd");
                            if (String.valueOf(EndType.IS_END.ordinal()).equals(isEnd)) {
                                cleanable = false;
                            }
                            cacheMap.remove(lastChar);
                        }
                        lastChar = keys[j];
                    }
                    if (cleanable) {
                        wordMap.remove(lastChar);
                    }
                }
            }
        }
    }
    /**
     * Read the contents of a word file and collect every line into a Set
     */
    private Set<String> readWordFile(String file) throws Exception {
        Set<String> set;
        // Character encoding
        String encoding = "UTF-8";
        // try-with-resources closes the stream; the file is expected to be on the classpath
        try (InputStreamReader read = new InputStreamReader(
                this.getClass().getResourceAsStream(file), encoding)) {
            set = new HashSet<>();
            BufferedReader bufferedReader = new BufferedReader(read);
            String txt;
            // One word per line
            while ((txt = bufferedReader.readLine()) != null) {
                set.add(txt);
            }
        }
        return set;
    }
}
public class WordFilter {

    /**
     * Sensitive word trie
     */
    private final Map wordMap;

    /**
     * Constructor
     */
    public WordFilter(WordContext context) {
        this.wordMap = context.getWordMap();
    }

    /**
     * Mask sensitive words
     *
     * @param text input text
     */
    public String replace(final String text) {
        return replace(text, 0, '*');
    }

    /**
     * Mask sensitive words
     *
     * @param text   input text
     * @param symbol replacement symbol
     */
    public String replace(final String text, final char symbol) {
        return replace(text, 0, symbol);
    }

    /**
     * Mask sensitive words
     *
     * @param text   input text
     * @param skip   max number of skipped (non-matching) characters allowed inside a match
     * @param symbol replacement symbol
     */
    public String replace(final String text, final int skip, final char symbol) {
        char[] charset = text.toCharArray();
        for (int i = 0; i < charset.length; i++) {
            FlagIndex fi = getFlagIndex(charset, i, skip);
            if (fi.isFlag()) {
                if (!fi.isWhiteWord()) {
                    // Replace every matched character with the mask symbol
                    for (int j : fi.getIndex()) {
                        charset[j] = symbol;
                    }
                } else {
                    // Whitelisted word: skip over it
                    i += fi.getIndex().size() - 1;
                }
            }
        }
        return new String(charset);
    }
    /**
     * Check whether the text contains any sensitive word
     *
     * @param text input text
     */
    public boolean include(final String text) {
        return include(text, 0);
    }

    /**
     * Check whether the text contains any sensitive word
     *
     * @param text input text
     * @param skip max number of skipped characters allowed inside a match
     */
    public boolean include(final String text, final int skip) {
        boolean include = false;
        char[] charset = text.toCharArray();
        for (int i = 0; i < charset.length; i++) {
            FlagIndex fi = getFlagIndex(charset, i, skip);
            if (fi.isFlag()) {
                if (fi.isWhiteWord()) {
                    i += fi.getIndex().size() - 1;
                } else {
                    include = true;
                    break;
                }
            }
        }
        return include;
    }

    /**
     * Count sensitive word occurrences
     *
     * @param text input text
     */
    public int wordCount(final String text) {
        return wordCount(text, 0);
    }

    /**
     * Count sensitive word occurrences
     *
     * @param text input text
     * @param skip max number of skipped characters allowed inside a match
     */
    public int wordCount(final String text, final int skip) {
        int count = 0;
        char[] charset = text.toCharArray();
        for (int i = 0; i < charset.length; i++) {
            FlagIndex fi = getFlagIndex(charset, i, skip);
            if (fi.isFlag()) {
                if (fi.isWhiteWord()) {
                    i += fi.getIndex().size() - 1;
                } else {
                    count++;
                }
            }
        }
        return count;
    }
    /**
     * List the sensitive words found in the text
     *
     * @param text input text
     */
    public List<String> wordList(final String text) {
        return wordList(text, 0);
    }

    /**
     * List the sensitive words found in the text
     *
     * @param text input text
     * @param skip max number of skipped characters allowed inside a match
     */
    public List<String> wordList(final String text, final int skip) {
        List<String> wordList = new ArrayList<>();
        char[] charset = text.toCharArray();
        for (int i = 0; i < charset.length; i++) {
            FlagIndex fi = getFlagIndex(charset, i, skip);
            if (fi.isFlag()) {
                if (fi.isWhiteWord()) {
                    i += fi.getIndex().size() - 1;
                } else {
                    StringBuilder builder = new StringBuilder();
                    for (int j : fi.getIndex()) {
                        char word = text.charAt(j);
                        builder.append(word);
                    }
                    wordList.add(builder.toString());
                }
            }
        }
        return wordList;
    }
    /**
     * Walk the trie starting at {@code begin} and record which indexes form a match
     *
     * @param charset input text as a char array
     * @param begin   start position of the scan
     * @param skip    max number of skipped characters allowed inside a match
     */
    private FlagIndex getFlagIndex(final char[] charset, final int begin, final int skip) {
        FlagIndex fi = new FlagIndex();
        Map current = wordMap;
        boolean flag = false;
        int count = 0;
        List<Integer> index = new ArrayList<>();
        for (int i = begin; i < charset.length; i++) {
            char word = charset[i];
            Map mapTree = (Map) current.get(word);
            if (count > skip || (i == begin && Objects.isNull(mapTree))) {
                break;
            }
            if (Objects.nonNull(mapTree)) {
                // Matched a character: descend and reset the skip counter
                current = mapTree;
                count = 0;
                index.add(i);
            } else {
                // Non-matching character: count it against the skip budget
                count++;
                if (flag && count > skip) {
                    break;
                }
            }
            if ("1".equals(current.get("isEnd"))) {
                flag = true;
            }
            if ("1".equals(current.get("isWhiteWord"))) {
                fi.setWhiteWord(true);
                break;
            }
        }
        fi.setFlag(flag);
        fi.setIndex(index);
        return fi;
    }
}
Note: this source code comes from gitee.com/humingzhang… I won't paste the rest of the code here; if you're interested, have a look for yourself.
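The remaining supporting types (WordType, EndType, FlagIndex, the exception and error-code classes) are among the code not reproduced here. Purely from how they are used above, they presumably look roughly like the sketch below; treat it as an assumption, the real definitions live in the linked repository:

import java.util.List;
import lombok.Data;

public enum WordType {
    /** Blacklist word (ordinal 0) */
    BLACK,
    /** Whitelist word (ordinal 1) */
    WHITE
}

public enum EndType {
    /** Not the last character of a word (ordinal 0) */
    HAS_NEXT,
    /** Last character of a word (ordinal 1) */
    IS_END
}

@Data
public class FlagIndex {
    /** Whether a (black or white) word was matched */
    private boolean flag;
    /** Whether the match is a whitelisted word */
    private boolean whiteWord;
    /** Indexes of the matched characters in the input */
    private List<Integer> index;
}

And a quick direct use of the utility, independent of the annotation introduced next (the word, the text, and the presence of blacklist.txt/whitelist.txt on the classpath are assumptions for the demo):

import java.util.Collections;
import java.util.List;

public class WordFilterDemo {
    public static void main(String[] args) {
        // Add a hypothetical word to the blacklist at runtime
        List<String> words = Collections.singletonList("badword");
        SensitiveWordUtil.CONTENT.addWord(words, WordType.BLACK);

        String text = "this contains badword here";
        System.out.println(SensitiveWordUtil.WORD_FILTER.include(text));  // true
        System.out.println(SensitiveWordUtil.WORD_FILTER.replace(text));  // this contains ******* here
        System.out.println(SensitiveWordUtil.WORD_FILTER.wordList(text)); // [badword]

        // And remove it again
        SensitiveWordUtil.CONTENT.removeWord(words, WordType.BLACK);
    }
}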
Sensitive Word Annotation and Method Interception
@Retention(value = RetentionPolicy.RUNTIME)
@Target(value = {ElementType.METHOD})
public @interface SensitiveWordFilter {

    /**
     * Content to check, as SpEL expressions against the method arguments
     *
     * @return content expressions
     */
    String[] content();

    /**
     * Filter type
     *
     * @return how a hit is handled
     */
    SensitiveWordFilterType filterType() default SensitiveWordFilterType.FILTER;
}
@AllArgsConstructor
public enum SensitiveWordFilterType {

    /**
     * Filter: reject the request when a sensitive word is found
     */
    FILTER,

    /**
     * Replace / mask the sensitive word
     */
    REPLACE,
    ;
}
@Slf4j
public class SensitiveWordInterceptor implements MethodInterceptor {

    private static final ParameterNameDiscoverer NAME_DISCOVERER = new DefaultParameterNameDiscoverer();

    private static final ExpressionParser PARSER = new SpelExpressionParser();

    private final BeanResolver beanResolver;

    public SensitiveWordInterceptor(BeanFactory beanFactory) {
        this.beanResolver = new BeanFactoryResolver(beanFactory);
    }

    @Override
    public Object invoke(MethodInvocation invocation) throws Throwable {
        // Only handle the invocation on the ultimate target class
        Class<?> cls = AopProxyUtils.ultimateTargetClass(invocation.getThis());
        if (!cls.equals(invocation.getThis().getClass())) {
            return invocation.proceed();
        }
        SensitiveWordFilter sensitiveWordFilter = invocation.getMethod().getAnnotation(SensitiveWordFilter.class);
        StandardEvaluationContext context = new MethodBasedEvaluationContext(null, invocation.getMethod(),
                invocation.getArguments(), NAME_DISCOVERER);
        context.setBeanResolver(beanResolver);
        String[] contentKeys = sensitiveWordFilter.content();
        if (contentKeys == null || contentKeys.length == 0) {
            log.warn("Nothing to filter: the content expressions are empty.");
            return invocation.proceed();
        }
        for (String key : contentKeys) {
            // Resolve the SpEL expression (e.g. #bo.name) against the method arguments
            String content = PARSER.parseExpression(key).getValue(context, String.class);
            if (StringUtils.isBlank(content)) {
                continue;
            }
            boolean include = SensitiveWordUtil.WORD_FILTER.include(StringUtils.deleteWhitespace(content));
            if (sensitiveWordFilter.filterType().equals(SensitiveWordFilterType.FILTER)) {
                if (include) {
                    log.error("Content contains a sensitive word, throwing exception | key:{} | content:{}", key, content);
                    throw new SensitiveWordException(SensitiveWordCode.CONTAINS_SENSITIVE_WORD);
                }
            } else if (sensitiveWordFilter.filterType().equals(SensitiveWordFilterType.REPLACE)) {
                if (include) {
                    // Write the masked value back into the argument through the same SpEL expression
                    PARSER.parseExpression(key).setValue(context, SensitiveWordUtil.WORD_FILTER.replace(StringUtils.deleteWhitespace(content)));
                    log.error("Content contains a sensitive word, masked | key:{} | content:{}", key, content);
                }
            }
        }
        return invocation.proceed();
    }
}
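The key trick is that the same SpEL expression is used both to read and to write the method argument. A standalone sketch of that read/write behavior, with a hypothetical Bo class standing in for the real argument type (names are illustrative only):

import org.springframework.expression.ExpressionParser;
import org.springframework.expression.spel.standard.SpelExpressionParser;
import org.springframework.expression.spel.support.StandardEvaluationContext;

public class SpelReadWriteDemo {

    // Hypothetical argument type, stands in for LiveRoomUpdateBo
    public static class Bo {
        private String name;
        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
    }

    public static void main(String[] args) {
        ExpressionParser parser = new SpelExpressionParser();
        StandardEvaluationContext context = new StandardEvaluationContext();

        Bo bo = new Bo();
        bo.setName("hello badword");
        // In the interceptor the variable is bound automatically from the method
        // parameter name; here we bind it by hand.
        context.setVariable("bo", bo);

        // Read the property through the expression, as the interceptor does before checking
        String value = parser.parseExpression("#bo.name").getValue(context, String.class);
        System.out.println(value);            // hello badword

        // Write the masked value back through the same expression
        parser.parseExpression("#bo.name").setValue(context, "hello *******");
        System.out.println(bo.getName());     // hello *******
    }
}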
public class SensitiveWordAnnotationAdvisor extends AbstractPointcutAdvisor implements BeanFactoryAware {

    private final Advice advice;

    private final Pointcut pointcut = AnnotationMatchingPointcut.forMethodAnnotation(SensitiveWordFilter.class);

    public SensitiveWordAnnotationAdvisor(@NonNull SensitiveWordInterceptor sensitiveWordInterceptor, int order) {
        this.advice = sensitiveWordInterceptor;
        setOrder(order);
    }

    @Override
    public Pointcut getPointcut() {
        return this.pointcut;
    }

    @Override
    public Advice getAdvice() {
        return this.advice;
    }

    @Override
    public void setBeanFactory(BeanFactory beanFactory) throws BeansException {
        if (this.advice instanceof BeanFactoryAware) {
            ((BeanFactoryAware) this.advice).setBeanFactory(beanFactory);
        }
    }
}
@Configuration
public class SensitiveWordFilterAutoConfiguration {

    @Bean
    @ConditionalOnMissingBean
    public SensitiveWordInterceptor sensitiveWordInterceptor(BeanFactory beanFactory) {
        return new SensitiveWordInterceptor(beanFactory);
    }

    @Bean
    @ConditionalOnMissingBean
    public SensitiveWordAnnotationAdvisor sensitiveWordAnnotationAdvisor(SensitiveWordInterceptor sensitiveWordInterceptor) {
        return new SensitiveWordAnnotationAdvisor(sensitiveWordInterceptor, Ordered.LOWEST_PRECEDENCE);
    }
}
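The configuration class still has to be registered so that services depending on the component actually load it. The article doesn't show how; if it's shipped as a starter jar, the classic Spring Boot mechanism would be a spring.factories entry (or, on Spring Boot 2.7+, a line in META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports), otherwise you could simply @Import the class. A sketch of the spring.factories variant, with a placeholder package name:

# src/main/resources/META-INF/spring.factories (package name is hypothetical)
org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
  com.example.sensitive.SensitiveWordFilterAutoConfiguration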
OK, the code is done. Now let's see whether it's actually elegant and intuitive to use.
Filtering: throw an exception on a hit
@SensitiveWordFilter(content = {"#bo.name", "#bo.intro"}, filterType = SensitiveWordFilterType.FILTER)
public void update(LiveRoomUpdateBo bo) {
}
Masking: desensitize the data in place
@SensitiveWordFilter(content = {"#bo.name", "#bo.intro"}, filterType = SensitiveWordFilterType.REPLACE)
public void update(LiveRoomUpdateBo bo) {
}
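One caveat: the REPLACE mode writes the masked value back through the property's setter via SpEL, so the BO must expose setters for the referenced fields. LiveRoomUpdateBo itself isn't shown in the article; a minimal sketch with just the properties the expressions above need (the field set is an assumption):

import lombok.Data;

/**
 * Hypothetical shape of the update BO; only name and intro are required
 * by the #bo.name / #bo.intro expressions above.
 */
@Data
public class LiveRoomUpdateBo {

    /** Live room name */
    private String name;

    /** Live room introduction */
    private String intro;
}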
Summary
Because this approach is elegant and non-intrusive, it can be extended to many other use cases, such as permission checks or distributed locks.
That said, AOP proxy-based method interception shouldn't be used everywhere: excessive reflection will inevitably hurt interface performance.
Reposted from: https://juejin.cn/post/7240458275270557754