java文本分词

d_hero
发布于 2023-7-3 11:56
浏览
0收藏

本文基于ik分词器实现对文本的关键词的分词

代码结构如下

java文本分词-鸿蒙开发者社区

这是我自定义的springboot-starter

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>common-starter</artifactId>
        <groupId>com.wlc.cloud</groupId>
        <version>1.0-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <artifactId>ik-analyzer-spring-boot-starter</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- Generates configuration metadata for IDE auto-completion; not needed at runtime -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
            <optional>true</optional>
        </dependency>
        <!-- IK analyzer, 2012 release (provides IKSegmenter / Lexeme) -->
        <dependency>
            <groupId>com.janeluo</groupId>
            <artifactId>ikanalyzer</artifactId>
            <version>2012_u6</version>
        </dependency>
        <dependency>
            <groupId>net.coobird</groupId>
            <artifactId>thumbnailator</artifactId>
            <version>0.4.8</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-core</artifactId>
        </dependency>
        <!-- Required for the starter's auto-configuration (spring.factories) -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-autoconfigure</artifactId>
        </dependency>
    </dependencies>

</project>

IKAnalyzerUtil

package com.ik.analyzer.util;

import com.ik.analyzer.constant.AnalyzerModel;
import com.ik.analyzer.segmenter.Segmenter;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Chinese word-segmentation utility built on the IK analyzer.
 *
 * <p>All entry points delegate to {@link #cutCore(String, boolean)}, which
 * drives an {@link IKSegmenter} over the input text and collects the lexemes.</p>
 *
 * <p>Date: 2022-01-09 12:44<br>
 * Author: IT学习道场</p>
 */
public class IKAnalyzerUtil {

    /** Utility class — not meant to be instantiated. */
    private IKAnalyzerUtil() {
    }

    /**
     * Segments the text using the default mode (smart segmentation).
     *
     * @param text the text to segment
     * @return the list of tokens, in encounter order
     * @throws IOException if the underlying segmenter fails while reading the text
     */
    public static List<String> cut(String text) throws IOException {
        // Smart mode is the default: it yields the fewest, most natural tokens.
        return cut(text, AnalyzerModel.useSmart);
    }

    /**
     * Segments the text using the given mode.
     *
     * @param text  the text to segment
     * @param model the segmentation mode (smart vs. fine-grained)
     * @return the list of tokens, in encounter order
     * @throws IOException if the underlying segmenter fails while reading the text
     */
    public static List<String> cut(String text, AnalyzerModel model) throws IOException {
        return cutCore(text, model.isEnable());
    }

    /**
     * Core segmentation loop shared by the public overloads.
     *
     * @param text     the text to segment
     * @param useSmart whether to enable the IK "smart" segmentation flag
     * @return the list of tokens, in encounter order
     * @throws IOException if the segmenter fails while reading the text
     */
    protected static List<String> cutCore(String text, boolean useSmart) throws IOException {
        IKSegmenter ik = Segmenter.getIKSegmenter(text, useSmart);
        List<String> list = new ArrayList<>();
        // Pull lexemes until the segmenter is exhausted (next() returns null at end).
        Lexeme lex;
        while ((lex = ik.next()) != null) {
            list.add(lex.getLexemeText());
        }
        return list;
    }

}

Segmenter

package com.ik.analyzer.segmenter;

import com.ik.analyzer.reader.IKReader;
import org.wltea.analyzer.core.IKSegmenter;
import java.io.StringReader;

/**
 * Factory for {@link IKSegmenter} instances.
 *
 * <p>Date: 2022-05-07 11:32<br>
 * Author: IT学习道场</p>
 */
public class Segmenter {

    /** Utility class — not meant to be instantiated. */
    private Segmenter() {
    }

    /**
     * Builds an IK segmenter over the given text.
     *
     * @param text     the text to segment
     * @param useSmart whether to enable the IK "smart" segmentation flag
     * @return a fresh {@link IKSegmenter} reading from the text
     */
    public static IKSegmenter getIKSegmenter(String text, boolean useSmart) {
        StringReader sr = IKReader.getSR(text);
        return new IKSegmenter(sr, useSmart);
    }

}

IKReader

package com.ik.analyzer.reader;

import java.io.StringReader;

/**
 * 描述: 分词reader </br>
 * 时间: 2022-05-07 11:36  </br>
 * 作者:IT学习道场
 */
public class IKReader {

    public static StringReader getSR(String text){
        StringReader sr = new StringReader(text);
        return sr;
    }
}

IKAnalyzer

package com.ik.analyzer.ik;

import com.ik.analyzer.constant.AnalyzerModel;
import com.ik.analyzer.ex.AnalyzerException;
import com.ik.analyzer.util.IKAnalyzerUtil;
import org.springframework.util.ObjectUtils;

import java.io.IOException;
import java.util.List;

/**
 * IK-based text analyzer, exposed to applications as a Spring bean.
 *
 * <p>Both entry points validate the input, segment it, and append the
 * original text as the final element of the result list.</p>
 *
 * <p>Date: 2022-05-07 10:50<br>
 * Author: IT学习道场</p>
 */
public class IKAnalyzer {
    /**
     * Segments the text using the default smart mode.
     *
     * @param text the text to segment; must not be null or empty
     * @return the tokens, with the original text appended as the last element
     * @throws IOException       if the segmenter fails while reading the text
     * @throws AnalyzerException if {@code text} is null or empty
     */
    public List<String> analyseStr(String text) throws IOException {
        // Delegate so that validation and result assembly live in one place.
        return analyseStr(text, AnalyzerModel.useSmart);
    }

    /**
     * Segments the text using the given mode.
     *
     * @param text  the text to segment; must not be null or empty
     * @param model the segmentation mode (smart vs. fine-grained)
     * @return the tokens, with the original text appended as the last element
     * @throws IOException       if the segmenter fails while reading the text
     * @throws AnalyzerException if {@code text} is null or empty
     */
    public List<String> analyseStr(String text, AnalyzerModel model) throws IOException {
        if (ObjectUtils.isEmpty(text)) {
            throw new AnalyzerException("======分词文本不能为空===========");
        }
        List<String> result = IKAnalyzerUtil.cut(text, model);
        // Keep the original text at the end; callers that do not want it can drop the last element.
        result.add(text);
        return result;
    }
}

AnalyzerException

package com.ik.analyzer.ex;

/**
 * Runtime exception raised when text analysis cannot proceed
 * (for example, when the input text is null or empty).
 *
 * <p>Date: 2022-05-07 11:02<br>
 * Author: IT学习道场</p>
 */
public class AnalyzerException extends RuntimeException {

    /** Creates an exception with a detail message. */
    public AnalyzerException(String message) {
        super(message);
    }

    /** Creates an exception with no detail message. */
    public AnalyzerException() {
    }

    /** Creates an exception with a detail message and a cause. */
    public AnalyzerException(String message, Throwable cause) {
        super(message, cause);
    }

    /** Creates an exception wrapping the given cause. */
    public AnalyzerException(Throwable cause) {
        super(cause);
    }

    /** Full constructor mirroring {@link RuntimeException}'s protected constructor. */
    public AnalyzerException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) {
        super(message, cause, enableSuppression, writableStackTrace);
    }
}

AnalyzerModel

package com.ik.analyzer.constant;

/**
 * Segmentation modes supported by the IK analyzer.
 *
 * <p>Date: 2022-05-09 8:31<br>
 * Author: IT学习道场</p>
 */
public enum AnalyzerModel {
    /** Smart mode: merges lexemes into the fewest, most natural tokens. */
    useSmart("useSmart","智能分词器模式",true),
    /** Fine-grained mode: emits every possible token. */
    unUseSmart("unUseSmart","最大限度分词模式",false),
    ;
    /** Mode code. */
    private String code;
    /** Human-readable description. */
    private String desc;
    /** Value passed to IK's "useSmart" flag for this mode. */
    private boolean enable;

    AnalyzerModel(String code, String desc, boolean enable) {
        this.code = code;
        this.desc = desc;
        this.enable = enable;
    }

    public String getCode() {
        return code;
    }

    /**
     * @deprecated Enum constants are shared singletons; mutating one affects
     *     every user in the JVM. Kept only for backward compatibility.
     */
    @Deprecated
    public void setCode(String code) {
        this.code = code;
    }

    public String getDesc() {
        return desc;
    }

    /**
     * @deprecated Enum constants are shared singletons; mutating one affects
     *     every user in the JVM. Kept only for backward compatibility.
     */
    @Deprecated
    public void setDesc(String desc) {
        this.desc = desc;
    }

    public boolean isEnable() {
        return enable;
    }

    /**
     * @deprecated Enum constants are shared singletons; mutating one affects
     *     every user in the JVM. Kept only for backward compatibility.
     */
    @Deprecated
    public void setEnable(boolean enable) {
        this.enable = enable;
    }
}

IKAnalyzerAutoConfiguration

package com.ik.analyzer.config;


import com.ik.analyzer.ik.IKAnalyzer;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

/**
 * Auto-configuration that registers the {@link IKAnalyzer} bean.
 *
 * <p>The bean is created only when the property {@code analyzer.ik.enable}
 * is present (and not {@code false}) in the application environment.</p>
 *
 * <p>Date: 2022-05-07 11:05<br>
 * Author: IT学习道场</p>
 */
@Configuration
public class IKAnalyzerAutoConfiguration {

    /** Exposes the IK analyzer to the application context. */
    @Bean
    @ConditionalOnProperty("analyzer.ik.enable")
    public IKAnalyzer iKAnalyzer() {
        return new IKAnalyzer();
    }
}

spring.factories

org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
  com.ik.analyzer.config.IKAnalyzerAutoConfiguration

IKAnalyzer.cfg.xml

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer 扩展配置</comment>
    <!--用户可以在这里配置自己的扩展字典 -->
    <entry key="ext_dict">local.dic;</entry>
    <!--用户可以在这里配置自己的扩展停止词字典 -->
    <entry key="ext_stopwords">stop.dic;</entry>
</properties>

local.dic

中国太平
干饭人

stop.dic:暂时没有配置停用词,所以该文件是空的

使用时,在需要分词功能的项目的 pom 中引入:

<dependency>
   <groupId>com.wlc.cloud</groupId>
   <artifactId>ik-analyzer-spring-boot-starter</artifactId>
   <version>1.0-SNAPSHOT</version>
</dependency>

实例:

@Slf4j
@RestController
@RequestMapping("/test")
public class TestController {
  @Autowired
  IKAnalyzer ikAnalyzer;
  
  @GetMapping("/analyseStr")
  public List<String> analyseStr() throws IOException {
    String text = "我们是一群干饭人";
    //默认是 useSmart-智能分词器模式
    List<String> list = ikAnalyzer.analyseStr(text);
    return list;
  }

}

响应结果:

{
  "code": 200,
  "success": true,
  "msg": "success",
  "data": [
    "我们",
    "是",
    "一群",
    "干饭人",
    "我们是一群干饭人"
  ]
}

分词结果中保留了原文(作为最后一个元素)。如果你不需要原文,可以修改

ikAnalyzer.analyseStr 方法,去掉添加原文的那一行代码。

java文本分词-鸿蒙开发者社区

最大限度分词模式测试

@Slf4j
@RestController
@RequestMapping("/test")
public class TestController {
  @Autowired
  IKAnalyzer ikAnalyzer;
  
  @GetMapping("/analyseStr")
  public List<String> analyseStr() throws IOException {
    String text = "我们是一群干饭人";
    List<String> list = ikAnalyzer.analyseStr(text, AnalyzerModel.unUseSmart);
    return list;
  }
  
  
  
}

响应结果:

{
  "code": 200,
  "success": true,
  "msg": "success",
  "data": [
    "我们",
    "我",
    "们",
    "是",
    "一群",
    "一",
    "群",
    "干饭人",
    "干饭",
    "人",
    "我们是一群干饭人"
  ]
}

自己去玩玩吧




文章转载自公众号:IT学习道场

分类
标签
已于2023-7-3 11:56:56修改
收藏
回复
举报
回复
    相关推荐