回复
java文本分词
d_hero
发布于 2023-7-3 11:56
浏览
0收藏
本文基于ik分词器实现对文本的关键词的分词
代码结构如下
这是我自定义的springboot-starter
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>common-starter</artifactId>
<groupId>com.wlc.cloud</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>ik-analyzer-spring-boot-starter</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-configuration-processor</artifactId>
<optional>true</optional>
</dependency>
<!--ik分词器2012引入-->
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
</dependency>
<dependency>
<groupId>net.coobird</groupId>
<artifactId>thumbnailator</artifactId>
<version>0.4.8</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-core</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-autoconfigure</artifactId>
</dependency>
</dependencies>
</project>
IKAnalyzerUtil
package com.ik.analyzer.util;
import com.ik.analyzer.constant.AnalyzerModel;
import com.ik.analyzer.segmenter.Segmenter;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* 描述: 中文分词util </br>
* 时间: 2022-01-09 12:44 </br>
* 作者:IT学习道场
*/
public class IKAnalyzerUtil {
    /**
     * Tokenizes the given text using the default mode (smart segmentation).
     *
     * @param text the text to tokenize
     * @return the list of tokens produced by the segmenter
     * @throws IOException if the underlying segmenter fails while reading
     */
    public static List<String> cut(String text) throws IOException {
        // Smart segmentation is the library default.
        return cut(text, AnalyzerModel.useSmart);
    }

    /**
     * Tokenizes the given text using the supplied analyzer mode.
     *
     * @param text  the text to tokenize
     * @param model the segmentation mode (smart vs. max-word)
     * @return the list of tokens produced by the segmenter
     * @throws IOException if the underlying segmenter fails while reading
     */
    public static List<String> cut(String text, AnalyzerModel model) throws IOException {
        return cutCore(text, model.isEnable());
    }

    /**
     * Core tokenization loop: drains the IK segmenter and collects each
     * lexeme's text in segmentation order.
     *
     * @param text     the text to tokenize
     * @param useSmart whether smart segmentation is enabled
     * @return the collected token strings
     * @throws IOException if the underlying segmenter fails while reading
     */
    protected static List<String> cutCore(String text, boolean useSmart) throws IOException {
        IKSegmenter segmenter = Segmenter.getIKSegmenter(text, useSmart);
        List<String> tokens = new ArrayList<>();
        // next() yields one Lexeme per token and null once the input is exhausted.
        for (Lexeme lexeme = segmenter.next(); lexeme != null; lexeme = segmenter.next()) {
            tokens.add(lexeme.getLexemeText());
        }
        return tokens;
    }
}
Segmenter
package com.ik.analyzer.segmenter;
import com.ik.analyzer.reader.IKReader;
import org.wltea.analyzer.core.IKSegmenter;
import java.io.StringReader;
/**
* 描述: 分词分节器 </br>
* 时间: 2022-05-07 11:32 </br>
* 作者:IT学习道场
*/
public class Segmenter {
    /**
     * Builds an IK segmenter over the given text.
     *
     * @param text     the text to segment
     * @param useSmart whether smart segmentation is enabled
     * @return a freshly constructed {@link IKSegmenter} reading from {@code text}
     */
    public static IKSegmenter getIKSegmenter(String text, boolean useSmart){
        return new IKSegmenter(IKReader.getSR(text), useSmart);
    }
}
IKReader
package com.ik.analyzer.reader;
import java.io.StringReader;
/**
* 描述: 分词reader </br>
* 时间: 2022-05-07 11:36 </br>
* 作者:IT学习道场
*/
public class IKReader {
    /**
     * Wraps the given text in a {@link StringReader} so the segmenter can
     * consume it as a character stream.
     *
     * @param text the text to read
     * @return a reader positioned at the start of {@code text}
     */
    public static StringReader getSR(String text){
        return new StringReader(text);
    }
}
IKAnalyzer
package com.ik.analyzer.ik;
import com.ik.analyzer.constant.AnalyzerModel;
import com.ik.analyzer.ex.AnalyzerException;
import com.ik.analyzer.util.IKAnalyzerUtil;
import org.springframework.util.ObjectUtils;
import java.io.IOException;
import java.util.List;
/**
* 描述: ik分词器 </br>
* 时间: 2022-05-07 10:50 </br>
* 作者:IT学习道场
*/
public class IKAnalyzer {
    /**
     * Tokenizes the text with the default (smart) segmentation mode.
     * The original text is appended as the last element of the result.
     *
     * @param text the text to tokenize; must not be null or empty
     * @return the tokens followed by the original text
     * @throws IOException       if the underlying segmenter fails while reading
     * @throws AnalyzerException if {@code text} is null or empty
     */
    public List<String> analyseStr(String text) throws IOException {
        requireText(text);
        List<String> result = IKAnalyzerUtil.cut(text);
        // Keep the original text as the last entry; remove this line if the
        // caller does not want the raw input echoed back.
        result.add(text);
        return result;
    }

    /**
     * Tokenizes the text with the supplied segmentation mode.
     * The original text is appended as the last element of the result.
     *
     * @param text  the text to tokenize; must not be null or empty
     * @param model the segmentation mode (smart vs. max-word)
     * @return the tokens followed by the original text
     * @throws IOException       if the underlying segmenter fails while reading
     * @throws AnalyzerException if {@code text} is null or empty
     */
    public List<String> analyseStr(String text, AnalyzerModel model) throws IOException {
        requireText(text);
        List<String> result = IKAnalyzerUtil.cut(text, model);
        result.add(text);
        return result;
    }

    /** Rejects null/empty input; shared by both overloads. */
    private static void requireText(String text) {
        if (ObjectUtils.isEmpty(text)){
            throw new AnalyzerException("======分词文本不能为空===========");
        }
    }
}
AnalyzerException
package com.ik.analyzer.ex;
/**
* 描述: todo </br>
* 时间: 2022-05-07 11:02 </br>
* 作者:IT学习道场
*/
public class AnalyzerException extends RuntimeException {
    /** Creates an exception with no detail message and no cause. */
    public AnalyzerException() {
    }

    /** Creates an exception with the given detail message. */
    public AnalyzerException(String message) {
        super(message);
    }

    /** Creates an exception with the given cause; the message is derived from it. */
    public AnalyzerException(Throwable cause) {
        super(cause);
    }

    /** Creates an exception with the given detail message and cause. */
    public AnalyzerException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates an exception with full control over suppression and
     * stack-trace writability (see {@link RuntimeException}).
     */
    public AnalyzerException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) {
        super(message, cause, enableSuppression, writableStackTrace);
    }
}
AnalyzerModel
package com.ik.analyzer.constant;
/**
* 描述: 分词器模式 </br>
* 时间: 2022-05-09 8:31 </br>
* 作者:IT学习道场
*/
public enum AnalyzerModel {
    /** Smart segmentation: fewer, higher-quality tokens. */
    useSmart("useSmart","智能分词器模式",true),
    /** Max-word segmentation: emits every possible token. */
    unUseSmart("unUseSmart","最大限度分词模式",false),
    ;
    /** Mode code. */
    private String code;
    /** Human-readable description. */
    private String desc;
    /** Whether smart segmentation is enabled for this mode. */
    private boolean enable;

    AnalyzerModel(String code, String desc, boolean enable) {
        this.code = code;
        this.desc = desc;
        this.enable = enable;
    }

    public String getCode() {
        return code;
    }

    /**
     * @deprecated Enum constants are JVM-wide singletons; mutating this field
     * changes behavior for every caller. Kept only for backward compatibility.
     */
    @Deprecated
    public void setCode(String code) {
        this.code = code;
    }

    public String getDesc() {
        return desc;
    }

    /**
     * @deprecated Enum constants are JVM-wide singletons; mutating this field
     * changes behavior for every caller. Kept only for backward compatibility.
     */
    @Deprecated
    public void setDesc(String desc) {
        this.desc = desc;
    }

    public boolean isEnable() {
        return enable;
    }

    /**
     * @deprecated Enum constants are JVM-wide singletons; flipping this flag
     * silently switches the segmentation mode for every caller. Kept only for
     * backward compatibility.
     */
    @Deprecated
    public void setEnable(boolean enable) {
        this.enable = enable;
    }
}
IKAnalyzerAutoConfiguration
package com.ik.analyzer.config;
import com.ik.analyzer.ik.IKAnalyzer;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
/**
* 描述: ik分词配置 </br>
* 时间: 2022-05-07 11:05 </br>
* 作者:IT学习道场
*/
@Configuration
public class IKAnalyzerAutoConfiguration {
@Bean
@ConditionalOnProperty("analyzer.ik.enable")
public IKAnalyzer iKAnalyzer(){
IKAnalyzer ikAnalyzer = new IKAnalyzer();
return ikAnalyzer;
}
}
spring.factories
org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
com.ik.analyzer.config.IKAnalyzerAutoConfiguration
IKAnalyzer.cfg.xml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
<comment>IK Analyzer 扩展配置</comment>
<!--用户可以在这里配置自己的扩展字典 -->
<entry key="ext_dict">local.dic;</entry>
<!--用户可以在这里配置自己的扩展停止词字典 -->
<entry key="ext_stopwords">stop.dic;</entry>
</properties>
local.dic
中国太平
干饭人
stop.dic 暂时没有配置停用词,所以是空的
使用时,只需在需要该功能的项目的 pom 中引入:
<dependency>
<groupId>com.wlc.cloud</groupId>
<artifactId>ik-analyzer-spring-boot-starter</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
实例:
@Slf4j
@RestController
@RequestMapping("/test")
public class TestController {
// Bean provided by the starter's auto-configuration (requires analyzer.ik.enable).
@Autowired
IKAnalyzer ikAnalyzer;
// Demo endpoint: tokenizes a fixed sample sentence.
@GetMapping("/analyseStr")
public List<String> analyseStr() throws IOException {
String text = "我们是一群干饭人";
// Default mode is useSmart (smart segmentation).
List<String> list = ikAnalyzer.analyseStr(text);
return list;
}
}
响应结果:
{
"code": 200,
"success": true,
"msg": "success",
"data": [
"我们",
"是",
"一群",
"干饭人",
"我们是一群干饭人"
]
}
分词结果保留了原文。如果你不需要原文,可以把原文去掉,只需修改
ikAnalyzer.analyseStr 方法的实现(去掉 result.add(text) 这一行)。
最大限度分词模式测试
@Slf4j
@RestController
@RequestMapping("/test")
public class TestController {
// Bean provided by the starter's auto-configuration (requires analyzer.ik.enable).
@Autowired
IKAnalyzer ikAnalyzer;
// Demo endpoint: tokenizes the sample sentence in max-word mode (unUseSmart).
@GetMapping("/analyseStr")
public List<String> analyseStr() throws IOException {
String text = "我们是一群干饭人";
List<String> list = ikAnalyzer.analyseStr(text, AnalyzerModel.unUseSmart);
return list;
}
}
响应结果:
{
"code": 200,
"success": true,
"msg": "success",
"data": [
"我们",
"我",
"们",
"是",
"一群",
"一",
"群",
"干饭人",
"干饭",
"人",
"我们是一群干饭人"
]
}
自己去玩玩吧
文章转载自公众号:IT学习道场
分类
标签
已于2023-7-3 11:56:56修改
赞
收藏
回复
相关推荐