Lucene integration

* Lucene integration
* nickname -> account: update the interfaces missed in the rename
ronger 2021-05-28 19:00:27 +08:00 committed by GitHub
commit 3bd1734ad9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
72 changed files with 294520 additions and 118 deletions

4
.gitignore vendored
View File

@ -30,3 +30,7 @@ build/
### VS Code ###
.vscode/
### lucene ###
index
userDic

37
pom.xml
View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.2.6.RELEASE</version>
<version>2.3.5.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<groupId>com.rymcu</groupId>
@ -17,6 +17,7 @@
<properties>
<java.version>1.8</java.version>
<lucene.version>8.0.0</lucene.version>
</properties>
<dependencies>
@ -209,6 +210,40 @@
</exclusion>
</exclusions>
</dependency>
<!-- lucene -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-suggest</artifactId>
<version>${lucene.version}</version>
</dependency>
<!-- hutool core utilities -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-core</artifactId>
<version>5.5.9</version>
</dependency>
</dependencies>
<build>

src/main/java/com/rymcu/forest/config/WebLogAspect.java
View File

@ -0,0 +1,88 @@
package com.rymcu.forest.config;
import org.aspectj.lang.JoinPoint;
import org.aspectj.lang.annotation.AfterReturning;
import org.aspectj.lang.annotation.Aspect;
import org.aspectj.lang.annotation.Before;
import org.aspectj.lang.annotation.Pointcut;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import org.springframework.web.context.request.RequestContextHolder;
import org.springframework.web.context.request.ServletRequestAttributes;
import javax.servlet.http.HttpServletRequest;
import java.util.Arrays;
import java.util.Enumeration;
/**
 * WebLogAspect implements the logging aspect for the web layer
 *
 * @author suwen
 * @date 2020/12/22 9:16 AM
 */
@Aspect
@Component
public class WebLogAspect {
private final Logger logger = LoggerFactory.getLogger(this.getClass());
ThreadLocal<Long> startTime = new ThreadLocal<Long>();
/**
 * Defines a pointcut. Breakdown:
 *
 * <p>~ the first * matches any modifier and any return type. ~ the second * any package ~ the third * any class.
 *
 * <p>~ the fourth * any class in the web package or its subpackages ~ the fifth * any method ~ .. matches any number
 * of parameters. Example: execution(* xiao.ze.demo.service.impl.*.*(..))
 */
@Pointcut("execution(* com.rymcu.forest.*.api.*.*.*(..))")
public void webLog() {
}
@Before("webLog()")
public void doBefore(JoinPoint joinPoint) {
startTime.set(System.currentTimeMillis());
// A request was received; log its content
logger.info("WebLogAspect.doBefore()");
ServletRequestAttributes attributes =
(ServletRequestAttributes) RequestContextHolder.getRequestAttributes();
HttpServletRequest request = attributes.getRequest();
// Log the request details
logger.info("URL : " + request.getRequestURL().toString());
logger.info("HTTP_METHOD : " + request.getMethod());
logger.info("IP : " + request.getRemoteAddr());
logger.info(
"CLASS_METHOD : "
+ joinPoint.getSignature().getDeclaringTypeName()
+ "."
+ joinPoint.getSignature().getName());
logger.info("ARGS : " + Arrays.toString(joinPoint.getArgs())
.replaceAll("(?<=password).*?(?=(nickname|$))", "=****, ")
.replaceAll("(?<=password).*?(?=(\\)|$))", "=****)]")
.replaceAll("(?<=password).*?(?=(code|$))", "=****, "));
// Approach one: iterate over all request parameters
Enumeration<String> enu = request.getParameterNames();
while (enu.hasMoreElements()) {
String paraName = enu.nextElement();
if ("password".equals(paraName)) {
continue;
}
logger.info(paraName + ": " + request.getParameter(paraName));
}
}
@AfterReturning("webLog()")
public void doAfterReturning(JoinPoint joinPoint) {
// The request has been handled; log the outcome
logger.info("WebLogAspect.doAfterReturning()");
logger.info("耗时(毫秒) : " + (System.currentTimeMillis() - startTime.get()));
}
}

src/main/java/com/rymcu/forest/config/WebMvcConfigurer.java
View File

@ -71,7 +71,7 @@ public class WebMvcConfigurer extends WebMvcConfigurationSupport {
public void addInterceptors(InterceptorRegistry registry) {
registry.addInterceptor(restAuthTokenInterceptor()).addPathPatterns("/api/**")
.excludePathPatterns("/api/v1/console/**", "/api/v1/article/articles/**", "/api/v1/article/detail/**"
, "/api/v1/topic/**", "/api/v1/user/**", "/api/v1/article/*/comments", "/api/v1/rule/currency/**");
, "/api/v1/topic/**", "/api/v1/user/**", "/api/v1/article/*/comments", "/api/v1/rule/currency/**", "/api/v1/lucene/**");
}

src/main/java/com/rymcu/forest/lucene/api/LuceneSearchController.java
View File

@ -0,0 +1,203 @@
package com.rymcu.forest.lucene.api;
import com.github.pagehelper.Page;
import com.github.pagehelper.PageInfo;
import com.rymcu.forest.core.result.GlobalResult;
import com.rymcu.forest.core.result.GlobalResultGenerator;
import com.rymcu.forest.dto.ArticleDTO;
import com.rymcu.forest.dto.PortfolioDTO;
import com.rymcu.forest.dto.UserDTO;
import com.rymcu.forest.lucene.model.ArticleLucene;
import com.rymcu.forest.lucene.model.PortfolioLucene;
import com.rymcu.forest.lucene.model.UserLucene;
import com.rymcu.forest.lucene.service.LuceneService;
import com.rymcu.forest.lucene.service.PortfolioLuceneService;
import com.rymcu.forest.lucene.service.UserDicService;
import com.rymcu.forest.lucene.service.UserLuceneService;
import com.rymcu.forest.lucene.util.ArticleIndexUtil;
import com.rymcu.forest.lucene.util.PortfolioIndexUtil;
import com.rymcu.forest.lucene.util.UserIndexUtil;
import com.rymcu.forest.util.Utils;
import org.springframework.web.bind.annotation.*;
import javax.annotation.PostConstruct;
import javax.annotation.Resource;
import java.io.FileNotFoundException;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
* LuceneSearchController
*
* @author suwen
* @date 2021/2/3 10:41
*/
@RestController
@RequestMapping("/api/v1/lucene")
public class LuceneSearchController {
@Resource
private LuceneService luceneService;
@Resource
private UserLuceneService userLuceneService;
@Resource
private PortfolioLuceneService portfolioLuceneService;
@Resource
private UserDicService dicService;
@PostConstruct
public void createIndex() {
// Delete indexes left over from previous runs and rebuild them
ArticleIndexUtil.deleteAllIndex();
UserIndexUtil.deleteAllIndex();
PortfolioIndexUtil.deleteAllIndex();
ExecutorService executor = Executors.newSingleThreadExecutor();
CompletableFuture<String> future =
CompletableFuture.supplyAsync(
() -> {
System.out.println(">>>>>>>>> 开始创建索引 <<<<<<<<<<<");
luceneService.writeArticle(luceneService.getAllArticleLucene());
userLuceneService.writeUser(userLuceneService.getAllUserLucene());
portfolioLuceneService.writePortfolio(portfolioLuceneService.getAllPortfolioLucene());
System.out.println(">>>>>>>>> 索引创建完毕 <<<<<<<<<<<");
System.out.println("加载用户配置的自定义扩展词典到主词库表");
try {
System.out.println(">>>>>>>>> 开始加载用户词典 <<<<<<<<<<<");
dicService.writeUserDic();
} catch (FileNotFoundException e) {
System.out.println("加载用户词典失败,未成功创建用户词典");
}
return ">>>>>>>>> 加载用户词典完毕 <<<<<<<<<<<";
},
executor);
future.thenAccept(System.out::println);
}
/**
 * Article search with highlighted matches
 *
 * @param q search keyword
 * @return paginated articles
 */
@GetMapping("/search-article")
public GlobalResult<?> searchArticle(
@RequestParam String q,
@RequestParam(defaultValue = "1") Integer page,
@RequestParam(defaultValue = "10") Integer rows) {
// Find matching articles, ranked by relevance in descending order
List<ArticleLucene> resList = luceneService.searchArticle(q);
// Paginate and assemble the article details
int total = resList.size();
if (total == 0) {
return GlobalResultGenerator.genSuccessResult("No matching articles found");
}
Page<ArticleDTO> articles = new Page<>(page, rows);
articles.setTotal(total);
int startIndex = (page - 1) * rows;
int endIndex = Math.min(startIndex + rows, total);
// Slice out the sublist for the current page
List<ArticleLucene> subList = resList.subList(startIndex, endIndex);
String[] ids = subList.stream().map(ArticleLucene::getIdArticle).toArray(String[]::new);
List<ArticleDTO> articleDTOList = luceneService.getArticlesByIds(ids);
ArticleDTO temp;
// Write the highlighted keyword fragments into the articles
for (int i = 0; i < articleDTOList.size(); i++) {
temp = articleDTOList.get(i);
temp.setArticleTitle(subList.get(i).getArticleTitle());
if (subList.get(i).getArticleContent().length() > 10) {
// If the content has too few hits, keep the original preview
temp.setArticlePreviewContent(subList.get(i).getArticleContent());
}
articleDTOList.set(i, temp);
}
articles.addAll(articleDTOList);
PageInfo<ArticleDTO> pageInfo = new PageInfo<>(articles);
return GlobalResultGenerator.genSuccessResult(Utils.getArticlesGlobalResult(pageInfo));
}
/**
 * User search with highlighted matches
 *
 * @param q search keyword
 * @return paginated users
 */
@GetMapping("/search-user")
public GlobalResult<?> searchUser(
@RequestParam String q,
@RequestParam(defaultValue = "1") Integer page,
@RequestParam(defaultValue = "10") Integer rows) {
// Find matching users, ranked by relevance in descending order
List<UserLucene> resList = userLuceneService.searchUser(q);
// Paginate and assemble the user details
int total = resList.size();
if (total == 0) {
return GlobalResultGenerator.genSuccessResult("No matching users found");
}
Page<UserDTO> users = new Page<>(page, rows);
users.setTotal(total);
int startIndex = (page - 1) * rows;
int endIndex = Math.min(startIndex + rows, total);
// Slice out the sublist for the current page
List<UserLucene> subList = resList.subList(startIndex, endIndex);
Integer[] ids = subList.stream().map(UserLucene::getIdUser).toArray(Integer[]::new);
List<UserDTO> userDTOList = userLuceneService.getUsersByIds(ids);
UserDTO temp;
// Write the highlighted keyword fragments into the users
for (int i = 0; i < userDTOList.size(); i++) {
temp = userDTOList.get(i);
temp.setNickname(subList.get(i).getNickname());
if (subList.get(i).getSignature().length() > 10) {
// If the signature has too few hits, keep the original
temp.setSignature(subList.get(i).getSignature());
}
userDTOList.set(i, temp);
}
users.addAll(userDTOList);
PageInfo<UserDTO> pageInfo = new PageInfo<>(users);
return GlobalResultGenerator.genSuccessResult(Utils.getUserGlobalResult(pageInfo));
}
/**
 * Portfolio search with highlighted matches
 *
 * @param q search keyword
 * @return paginated portfolios
 */
@GetMapping("/search-portfolio")
public GlobalResult<?> searchPortfolio(
@RequestParam String q,
@RequestParam(defaultValue = "1") Integer page,
@RequestParam(defaultValue = "10") Integer rows) {
// Find matching portfolios, ranked by relevance in descending order
List<PortfolioLucene> resList = portfolioLuceneService.searchPortfolio(q);
// Paginate and assemble the portfolio details
int total = resList.size();
if (total == 0) {
return GlobalResultGenerator.genSuccessResult("No matching portfolios found");
}
Page<PortfolioDTO> portfolios = new Page<>(page, rows);
portfolios.setTotal(total);
int startIndex = (page - 1) * rows;
int endIndex = Math.min(startIndex + rows, total);
// Slice out the sublist for the current page
List<PortfolioLucene> subList = resList.subList(startIndex, endIndex);
String[] ids = subList.stream().map(PortfolioLucene::getIdPortfolio).toArray(String[]::new);
List<PortfolioDTO> portfolioDTOList = portfolioLuceneService.getPortfoliosByIds(ids);
PortfolioDTO temp;
// Write the highlighted keyword fragments into the portfolios
for (int i = 0; i < portfolioDTOList.size(); i++) {
temp = portfolioDTOList.get(i);
temp.setPortfolioTitle(subList.get(i).getPortfolioTitle());
if (subList.get(i).getPortfolioDescription().length() > 10) {
// If the description has too few hits, keep the original
temp.setPortfolioDescription(subList.get(i).getPortfolioDescription());
}
portfolioDTOList.set(i, temp);
}
portfolios.addAll(portfolioDTOList);
PageInfo<PortfolioDTO> pageInfo = new PageInfo<>(portfolios);
return GlobalResultGenerator.genSuccessResult(Utils.getPortfolioGlobalResult(pageInfo));
}
}
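
All three search endpoints paginate the relevance-ranked hit list in memory with the same subList slice. Note that the slice is taken unguarded, so a page number past the last page makes startIndex exceed the list size and the subList call fails. A minimal guarded variant of the same idiom (the PageSlicer helper is hypothetical, not part of this commit):

import java.util.Collections;
import java.util.List;

/** Hypothetical helper mirroring the slice pagination used by the three search endpoints above. */
final class PageSlicer {
    static <T> List<T> slice(List<T> hits, int page, int rows) {
        int startIndex = (page - 1) * rows;                      // page numbers are 1-based
        int endIndex = Math.min(startIndex + rows, hits.size());
        if (startIndex < 0 || startIndex >= endIndex) {
            return Collections.emptyList();                      // out-of-range page: empty slice
        }
        return hits.subList(startIndex, endIndex);
    }
}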

src/main/java/com/rymcu/forest/lucene/api/UserDicController.java
View File

@ -0,0 +1,60 @@
package com.rymcu.forest.lucene.api;
import com.github.pagehelper.PageHelper;
import com.github.pagehelper.PageInfo;
import com.rymcu.forest.core.result.GlobalResult;
import com.rymcu.forest.core.result.GlobalResultGenerator;
import com.rymcu.forest.lucene.model.UserDic;
import com.rymcu.forest.lucene.service.UserDicService;
import com.rymcu.forest.util.Utils;
import org.springframework.web.bind.annotation.*;
import javax.annotation.Resource;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* UserDicController
*
* @author suwen
* @date 2021/2/4 09:29
*/
@RestController
@RequestMapping("/api/v1/lucene/dic")
public class UserDicController {
@Resource private UserDicService dicService;
@GetMapping("/getAll")
public GlobalResult getAll(
@RequestParam(defaultValue = "0") Integer page,
@RequestParam(defaultValue = "10") Integer rows) {
PageHelper.startPage(page, rows);
List<UserDic> list = dicService.getAll();
PageInfo<UserDic> pageInfo = new PageInfo<>(list);
Map<String, Object> map = new HashMap<>(2);
map.put("userDic", pageInfo.getList());
Map pagination = Utils.getPagination(pageInfo);
map.put("pagination", pagination);
return GlobalResultGenerator.genSuccessResult(map);
}
@PostMapping("/addDic/{dic}")
public GlobalResult addDic(@PathVariable String dic) {
dicService.addDic(dic);
return GlobalResultGenerator.genSuccessResult("新增字典成功");
}
@PutMapping("/editDic")
public GlobalResult getAllDic(@RequestBody UserDic dic) {
dicService.updateDic(dic);
return GlobalResultGenerator.genSuccessResult("更新字典成功");
}
@DeleteMapping("/deleteDic/{id}")
public GlobalResult deleteDic(@PathVariable String id) {
dicService.deleteDic(id);
return GlobalResultGenerator.genSuccessResult("删除字典成功");
}
}

src/main/java/com/rymcu/forest/lucene/cfg/Configuration.java
View File

@ -0,0 +1,75 @@
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.rymcu.forest.lucene.cfg;
import java.util.List;
/**
 * Configuration management interface
 */
public interface Configuration {
/**
 * Returns the useSmart flag
 * useSmart = true: the segmenter uses the smart segmentation strategy; false: fine-grained segmentation
 * @return useSmart
 */
boolean useSmart();
/**
 * Sets the useSmart flag
 * useSmart = true: the segmenter uses the smart segmentation strategy; false: fine-grained segmentation
 * @param useSmart
 */
void setUseSmart(boolean useSmart);
/**
 * Gets the main dictionary path
 *
 * @return String main dictionary path
 */
String getMainDictionary();
/**
 * Gets the quantifier dictionary path
 * @return String quantifier dictionary path
 */
String getQuantifierDictionary();
/**
 * Gets the extension dictionary paths
 * @return List<String> paths relative to the class loader
 */
List<String> getExtDictionary();
/**
 * Gets the extension stopword dictionary paths
 * @return List<String> paths relative to the class loader
 */
List<String> getExtStopWordDictionary();
}

src/main/java/com/rymcu/forest/lucene/cfg/DefaultConfig.java
View File

@ -0,0 +1,133 @@
/**
* IK 中文分词 版本 5.0 IK Analyzer release 5.0
*
* <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional information regarding
* copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License. You may obtain a
* copy of the License at
*
* <p>http://www.apache.org/licenses/LICENSE-2.0
*
* <p>Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*
* <p>源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012乌龙茶工作室 provided by Linliangyi and copyright 2012
* by Oolong studio
*/
package com.rymcu.forest.lucene.cfg;
import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.List;
/** Default implementation of Configuration, 2012-05-08 */
@Component
public class DefaultConfig implements Configuration {
/** Default main dictionary path */
private static final String PATH_DIC_MAIN = "lucene/main2012.dic";
/** Quantifier dictionary path */
private static final String PATH_DIC_QUANTIFIER = "lucene/quantifier.dic";
/** User-defined dictionary path */
private static final String PATH_USER_DIC =
System.getProperty("user.dir") + "/lucene/userDic/userDic.dic";
/** Config property: extension dictionaries */
private String extDic = "lucene/ext.dic;" + PATH_USER_DIC;
/** Config property: extension stopword dictionary */
private String extStopword = "lucene/stopword.dic";
/** Whether to segment in smart mode */
private boolean useSmart;
/**
 * Returns a default Configuration instance (note: despite the original "singleton" wording, a new instance is created on each call)
 *
 * @return Configuration instance
 */
public static Configuration getInstance() {
return new DefaultConfig();
}
/**
 * Returns the useSmart flag: true = smart segmentation strategy, false = fine-grained segmentation
 *
 * @return useSmart
 */
@Override
public boolean useSmart() {
return useSmart;
}
/**
 * Sets the useSmart flag: true = smart segmentation strategy, false = fine-grained segmentation
 *
 * @param useSmart
 */
@Override
public void setUseSmart(boolean useSmart) {
this.useSmart = useSmart;
}
/**
 * Gets the main dictionary path
 *
 * @return String main dictionary path
 */
@Override
public String getMainDictionary() {
return PATH_DIC_MAIN;
}
/**
 * Gets the quantifier dictionary path
 *
 * @return String quantifier dictionary path
 */
@Override
public String getQuantifierDictionary() {
return PATH_DIC_QUANTIFIER;
}
/**
 * Gets the extension dictionary paths
 *
 * @return List<String> paths relative to the class loader
 */
@Override
public List<String> getExtDictionary() {
List<String> extDictFiles = new ArrayList<String>(2);
if (extDic != null) {
// Multiple extension dictionaries are separated by ';'
String[] filePaths = extDic.split(";");
for (String filePath : filePaths) {
if (filePath != null && !"".equals(filePath.trim())) {
extDictFiles.add(filePath.trim());
}
}
}
return extDictFiles;
}
/**
 * Gets the extension stopword dictionary paths
 *
 * @return List<String> paths relative to the class loader
 */
@Override
public List<String> getExtStopWordDictionary() {
List<String> extStopWordDictFiles = new ArrayList<>(2);
if (extStopword != null) {
// Multiple extension dictionaries are separated by ';'
String[] filePaths = extStopword.split(";");
for (String filePath : filePaths) {
if (filePath != null && !"".equals(filePath.trim())) {
extStopWordDictFiles.add(filePath.trim());
}
}
}
return extStopWordDictFiles;
}
}

src/main/java/com/rymcu/forest/lucene/core/AnalyzeContext.java
View File

@ -0,0 +1,353 @@
/**
* IK 中文分词 版本 5.0 IK Analyzer release 5.0
*
* <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional information regarding
* copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License. You may obtain a
* copy of the License at
*
* <p>http://www.apache.org/licenses/LICENSE-2.0
*
* <p>Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*
* <p>源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012乌龙茶工作室 provided by Linliangyi and copyright 2012
* by Oolong studio
*/
package com.rymcu.forest.lucene.core;
import com.rymcu.forest.lucene.cfg.Configuration;
import com.rymcu.forest.lucene.dic.Dictionary;
import java.io.IOException;
import java.io.Reader;
import java.util.*;
/** Segmentation context state */
class AnalyzeContext {
/** Default buffer size */
private static final int BUFF_SIZE = 4096;
/** Threshold at which the buffer counts as nearly exhausted */
private static final int BUFF_EXHAUST_CRITICAL = 100;
/** Character read buffer */
private char[] segmentBuff;
/** Character type array */
private int[] charTypes;
/** Total length of text already analyzed from the Reader; across multiple passes it accumulates the offset of the current segmentBuff relative to the start of the reader */
private int buffOffset;
/** Current buffer cursor */
private int cursor;
/** Length of the most recently read, processable chunk */
private int available;
/** Sub-segmenter locks; a non-empty set means a sub-segmenter is holding segmentBuff */
private final Set<String> buffLocker;
/** Raw segmentation results, before ambiguity resolution */
private QuickSortSet orgLexemes;
/** LexemePath position index map */
private final Map<Integer, LexemePath> pathMap;
/** Final segmentation results */
private final LinkedList<Lexeme> results;
/** Segmenter configuration */
private final Configuration cfg;
public AnalyzeContext(Configuration cfg) {
this.cfg = cfg;
this.segmentBuff = new char[BUFF_SIZE];
this.charTypes = new int[BUFF_SIZE];
this.buffLocker = new HashSet<>();
this.orgLexemes = new QuickSortSet();
this.pathMap = new HashMap<>();
this.results = new LinkedList<>();
}
int getCursor() {
return this.cursor;
}
char[] getSegmentBuff() {
return this.segmentBuff;
}
char getCurrentChar() {
return this.segmentBuff[this.cursor];
}
int getCurrentCharType() {
return this.charTypes[this.cursor];
}
int getBufferOffset() {
return this.buffOffset;
}
/**
 * Fills segmentBuff according to the context state
 *
 * @param reader
 * @return the length of valid text awaiting analysis
 * @throws IOException
 */
int fillBuffer(Reader reader) throws IOException {
int readCount = 0;
if (this.buffOffset == 0) {
// First read from the reader
readCount = reader.read(segmentBuff);
} else {
int offset = this.available - this.cursor;
if (offset > 0) {
// More was read than processed; copy the unprocessed text to the head of segmentBuff
System.arraycopy(this.segmentBuff, this.cursor, this.segmentBuff, 0, offset);
readCount = offset;
}
// Keep reading from the reader, filling the remainder of segmentBuff after the copied text
readCount += reader.read(this.segmentBuff, offset, BUFF_SIZE - offset);
}
// Record the number of usable characters obtained from this read
this.available = readCount;
// Reset the cursor
this.cursor = 0;
return readCount;
}
/** Initialize the buffer cursor and process the first character */
void initCursor() {
this.cursor = 0;
this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
}
/** Advance the cursor by one and process the character there; returns true on success, false when the cursor is already at the end of the buffer */
boolean moveCursor() {
if (this.cursor < this.available - 1) {
this.cursor++;
this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
return true;
} else {
return false;
}
}
/**
 * Locks the current segmentBuff by adding the name of the occupying sub-segmenter
 *
 * @param segmenterName
 */
void lockBuffer(String segmenterName) {
this.buffLocker.add(segmenterName);
}
/**
 * Removes the named sub-segmenter, releasing its hold on segmentBuff
 *
 * @param segmenterName
 */
void unlockBuffer(String segmenterName) {
this.buffLocker.remove(segmenterName);
}
/**
 * The buffer is locked as long as buffLocker contains any segmenterName
 *
 * @return boolean whether the buffer is locked
 */
boolean isBufferLocked() {
return this.buffLocker.size() > 0;
}
/**
 * Checks whether the current segmentBuff has been fully consumed, i.e. the cursor has reached the end at this.available - 1
 *
 * @return
 */
boolean isBufferConsumed() {
return this.cursor == this.available - 1;
}
/**
 * Checks whether segmentBuff needs to read in new data
 *
 * <p>A refill is needed when all of the following hold: 1. available == BUFF_SIZE, the buffer is full; 2. buffIndex < available - 1 && buffIndex >
 * available - BUFF_EXHAUST_CRITICAL, the cursor is inside the critical zone; 3. !context.isBufferLocked(), no segmenter is holding the buffer.
 * The current loop must then be interrupted so the buffer can be shifted and refilled with new data
 *
 * @return
 */
boolean needRefillBuffer() {
return this.available == BUFF_SIZE
&& this.cursor < this.available - 1
&& this.cursor > this.available - BUFF_EXHAUST_CRITICAL
&& !this.isBufferLocked();
}
/** Accumulate the offset of the current segmentBuff relative to the start of the reader */
void markBufferOffset() {
this.buffOffset += this.cursor;
}
/**
 * Adds a lexeme to the raw result set
 *
 * @param lexeme
 */
void addLexeme(Lexeme lexeme) {
this.orgLexemes.addLexeme(lexeme);
}
/**
 * Adds a segmentation path to the map: path start position ---> path
 *
 * @param path
 */
void addLexemePath(LexemePath path) {
if (path != null) {
this.pathMap.put(path.getPathBegin(), path);
}
}
/**
 * Returns the raw segmentation results
 *
 * @return
 */
QuickSortSet getOrgLexemes() {
return this.orgLexemes;
}
/**
 * Pushes segmentation results into the result list: 1. walk from the head of the buffer to the processed position this.cursor;
 * 2. push lexeme paths found in the map into results; 3. push CJK characters not covered by the map as single-character lexemes
 */
void outputToResult() {
int index = 0;
while (index <= this.cursor) {
// Skip non-CJK characters
if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) {
index++;
continue;
}
// Look up the LexemePath for this index in pathMap
LexemePath path = this.pathMap.get(index);
if (path != null) {
// Emit the lexemes of this LexemePath into results
Lexeme l = path.pollFirst();
while (l != null) {
this.results.add(l);
// Move index past this lexeme
index = l.getBegin() + l.getLength();
l = path.pollFirst();
if (l != null) {
// Emit single characters skipped between lexemes inside the path
for (; index < l.getBegin(); index++) {
this.outputSingleCJK(index);
}
}
}
} else { // No LexemePath found for this index in pathMap
// Emit as a single character
this.outputSingleCJK(index);
index++;
}
}
// Clear the map
this.pathMap.clear();
}
/**
 * Emits a CJK character as a single-character lexeme
 *
 * @param index
 */
private void outputSingleCJK(int index) {
if (CharacterUtil.CHAR_CHINESE == this.charTypes[index]) {
Lexeme singleCharLexeme = new Lexeme(this.buffOffset, index, 1, Lexeme.TYPE_CNCHAR);
this.results.add(singleCharLexeme);
} else if (CharacterUtil.CHAR_OTHER_CJK == this.charTypes[index]) {
Lexeme singleCharLexeme = new Lexeme(this.buffOffset, index, 1, Lexeme.TYPE_OTHER_CJK);
this.results.add(singleCharLexeme);
}
}
/**
 * Returns the next lexeme
 *
 * <p>Also merges compound numerals/quantifiers
 *
 * @return
 */
Lexeme getNextLexeme() {
// Take and remove the first Lexeme from the result list
Lexeme result = this.results.pollFirst();
while (result != null) {
// Merge numerals and quantifiers
this.compound(result);
if (Dictionary.getSingleton()
.isStopWord(this.segmentBuff, result.getBegin(), result.getLength())) {
// It is a stopword; move on to the next one
result = this.results.pollFirst();
} else {
// Not a stopword; generate the lexeme text and emit it
result.setLexemeText(String.valueOf(segmentBuff, result.getBegin(), result.getLength()));
break;
}
}
return result;
}
/** Reset the segmentation context state */
void reset() {
this.buffLocker.clear();
this.orgLexemes = new QuickSortSet();
this.available = 0;
this.buffOffset = 0;
this.charTypes = new int[BUFF_SIZE];
this.cursor = 0;
this.results.clear();
this.segmentBuff = new char[BUFF_SIZE];
this.pathMap.clear();
}
/** 组合词元 */
private void compound(Lexeme result) {
if (!this.cfg.useSmart()) {
return;
}
// Numeral/quantifier merging
if (!this.results.isEmpty()) {
if (Lexeme.TYPE_ARABIC == result.getLexemeType()) {
Lexeme nextLexeme = this.results.peekFirst();
boolean appendOk = false;
if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
// Merge an Arabic numeral with a Chinese numeral
appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
} else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
// Merge an Arabic numeral with a Chinese quantifier
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
}
if (appendOk) {
// Pop it off the result list
this.results.pollFirst();
}
}
// A second round of merging may apply
if (Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()) {
Lexeme nextLexeme = this.results.peekFirst();
boolean appendOk = false;
if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
// Merge a Chinese numeral with a Chinese quantifier
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
}
if (appendOk) {
// Pop it off the result list
this.results.pollFirst();
}
}
}
}
}

src/main/java/com/rymcu/forest/lucene/core/CJKSegmenter.java
View File

@ -0,0 +1,119 @@
/**
* IK 中文分词 版本 5.0 IK Analyzer release 5.0
*
* <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional information regarding
* copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License. You may obtain a
* copy of the License at
*
* <p>http://www.apache.org/licenses/LICENSE-2.0
*
* <p>Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*
* <p>源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012乌龙茶工作室 provided by Linliangyi and copyright 2012
* by Oolong studio
*/
package com.rymcu.forest.lucene.core;
import com.rymcu.forest.lucene.dic.Dictionary;
import com.rymcu.forest.lucene.dic.Hit;
import java.util.LinkedList;
import java.util.List;
/** Sub-segmenter for Chinese and Japanese/Korean text */
class CJKSegmenter implements ISegmenter {
/** Sub-segmenter name */
static final String SEGMENTER_NAME = "CJK_SEGMENTER";
/** Queue of pending dictionary hits */
private List<Hit> tmpHits;
CJKSegmenter() {
this.tmpHits = new LinkedList<Hit>();
}
@Override
public void analyze(AnalyzeContext context) {
if (CharacterUtil.CHAR_USELESS != context.getCurrentCharType()) {
// Process pending hits in tmpHits first
if (!this.tmpHits.isEmpty()) {
// Process the hit queue
Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
for (Hit hit : tmpArray) {
hit =
Dictionary.getSingleton()
.matchWithHit(context.getSegmentBuff(), context.getCursor(), hit);
if (hit.isMatch()) {
// Emit the current word
Lexeme newLexeme =
new Lexeme(
context.getBufferOffset(),
hit.getBegin(),
context.getCursor() - hit.getBegin() + 1,
Lexeme.TYPE_CNWORD);
context.addLexeme(newLexeme);
if (!hit.isPrefix()) { // Not a word prefix; no further matching needed, remove it
this.tmpHits.remove(hit);
}
} else if (hit.isUnmatch()) {
// The hit is not a word; remove it
this.tmpHits.remove(hit);
}
}
}
// Then try a single-character match at the current cursor position
Hit singleCharHit =
Dictionary.getSingleton()
.matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
if (singleCharHit.isMatch()) { // The single character is itself a word
// Emit the current word
Lexeme newLexeme =
new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_CNWORD);
context.addLexeme(newLexeme);
// It is also a word prefix
if (singleCharHit.isPrefix()) {
// On a prefix match, add it to the hit list
this.tmpHits.add(singleCharHit);
}
} else if (singleCharHit.isPrefix()) { // The character is a word prefix
// On a prefix match, add it to the hit list
this.tmpHits.add(singleCharHit);
}
} else {
// Hit a CHAR_USELESS character; clear the queue
this.tmpHits.clear();
}
// Check whether the buffer has been fully consumed
if (context.isBufferConsumed()) {
// Clear the queue
this.tmpHits.clear();
}
// Lock the buffer while hits are pending, otherwise unlock it
if (this.tmpHits.size() == 0) {
context.unlockBuffer(SEGMENTER_NAME);
} else {
context.lockBuffer(SEGMENTER_NAME);
}
}
@Override
public void reset() {
// Clear the queue
this.tmpHits.clear();
}
}

src/main/java/com/rymcu/forest/lucene/core/CN_QuantifierSegmenter.java
View File

@ -0,0 +1,226 @@
/**
* IK 中文分词 版本 5.0 IK Analyzer release 5.0
*
* <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional information regarding
* copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License. You may obtain a
* copy of the License at
*
* <p>http://www.apache.org/licenses/LICENSE-2.0
*
* <p>Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*
* <p>源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012乌龙茶工作室 provided by Linliangyi and copyright 2012
* by Oolong studio
*/
package com.rymcu.forest.lucene.core;
import com.rymcu.forest.lucene.dic.Dictionary;
import com.rymcu.forest.lucene.dic.Hit;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
/** Sub-segmenter for Chinese numerals and quantifiers */
class CN_QuantifierSegmenter implements ISegmenter {
/** Sub-segmenter name */
static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
/** Chinese numeral characters */
private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";
private static Set<Character> ChnNumberChars = new HashSet<>();
static {
char[] ca = Chn_Num.toCharArray();
for (char nChar : ca) {
ChnNumberChars.add(nChar);
}
}
/** Start position of the numeral lexeme; doubles as the segmenter's state flag: when nStart > -1 the segmenter is processing characters */
private int nStart;
/** End position of the numeral lexeme; records where the last valid numeral character ends */
private int nEnd;
/** Queue of pending quantifier hits */
private final List<Hit> countHits;
CN_QuantifierSegmenter() {
nStart = -1;
nEnd = -1;
this.countHits = new LinkedList<Hit>();
}
/** Segment */
@Override
public void analyze(AnalyzeContext context) {
// Process Chinese numerals
this.processCNumber(context);
// Process Chinese quantifiers
this.processCount(context);
// Lock the buffer while work is in progress, otherwise unlock it
if (this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()) {
// Unlock the buffer
context.unlockBuffer(SEGMENTER_NAME);
} else {
context.lockBuffer(SEGMENTER_NAME);
}
}
/** Reset the sub-segmenter state */
@Override
public void reset() {
nStart = -1;
nEnd = -1;
countHits.clear();
}
/** Process numerals */
private void processCNumber(AnalyzeContext context) {
if (nStart == -1 && nEnd == -1) { // Initial state
if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
&& ChnNumberChars.contains(context.getCurrentChar())) {
// Record the start and end positions of the numeral
nStart = context.getCursor();
nEnd = context.getCursor();
}
} else { // Processing state
if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
&& ChnNumberChars.contains(context.getCurrentChar())) {
// Record the numeral's end position
nEnd = context.getCursor();
} else {
// Emit the numeral
this.outputNumLexeme(context);
// Reset the start/end pointers
nStart = -1;
nEnd = -1;
}
}
// Buffer consumed but a numeral is still pending output
if (context.isBufferConsumed()) {
if (nStart != -1 && nEnd != -1) {
// Emit the numeral
outputNumLexeme(context);
// Reset the start/end pointers
nStart = -1;
nEnd = -1;
}
}
}
/**
 * Process Chinese quantifiers
 *
 * @param context
 */
private void processCount(AnalyzeContext context) {
// Check whether quantifier scanning needs to start
if (!this.needCountScan(context)) {
return;
}
if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()) {
// Process pending hits in countHits first
if (!this.countHits.isEmpty()) {
// Process the hit queue
Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
for (Hit hit : tmpArray) {
hit =
Dictionary.getSingleton()
.matchWithHit(context.getSegmentBuff(), context.getCursor(), hit);
if (hit.isMatch()) {
// Emit the current word
Lexeme newLexeme =
new Lexeme(
context.getBufferOffset(),
hit.getBegin(),
context.getCursor() - hit.getBegin() + 1,
Lexeme.TYPE_COUNT);
context.addLexeme(newLexeme);
if (!hit.isPrefix()) { // Not a word prefix; no further matching needed, remove it
this.countHits.remove(hit);
}
} else if (hit.isUnmatch()) {
// The hit is not a word; remove it
this.countHits.remove(hit);
}
}
}
// Try a single-character match at the current cursor position
Hit singleCharHit =
Dictionary.getSingleton()
.matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
if (singleCharHit.isMatch()) { // The single character is itself a quantifier
// Emit the current word
Lexeme newLexeme =
new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_COUNT);
context.addLexeme(newLexeme);
// It is also a word prefix
if (singleCharHit.isPrefix()) {
// On a prefix match, add it to the hit list
this.countHits.add(singleCharHit);
}
} else if (singleCharHit.isPrefix()) { // The character is a quantifier prefix
// On a prefix match, add it to the hit list
this.countHits.add(singleCharHit);
}
} else {
// Not a Chinese character; discard unfinished quantifiers
this.countHits.clear();
}
// Buffer consumed but quantifiers are still pending
if (context.isBufferConsumed()) {
// Discard unfinished quantifiers
this.countHits.clear();
}
}
/**
 * Checks whether quantifier scanning is needed
 *
 * @return
 */
private boolean needCountScan(AnalyzeContext context) {
if ((nStart != -1 && nEnd != -1) || !countHits.isEmpty()) {
// A Chinese numeral or a quantifier is currently being processed
return true;
} else {
// Look for an adjacent numeral
if (!context.getOrgLexemes().isEmpty()) {
Lexeme l = context.getOrgLexemes().peekLast();
if (Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()) {
if (l.getBegin() + l.getLength() == context.getCursor()) {
return true;
}
}
}
}
return false;
}
/**
 * Adds a numeral lexeme to the result set
 *
 * @param context
 */
private void outputNumLexeme(AnalyzeContext context) {
if (nStart > -1 && nEnd > -1) {
// Emit the numeral
Lexeme newLexeme =
new Lexeme(context.getBufferOffset(), nStart, nEnd - nStart + 1, Lexeme.TYPE_CNUM);
context.addLexeme(newLexeme);
}
}
}

src/main/java/com/rymcu/forest/lucene/core/CharacterUtil.java
View File

@ -0,0 +1,94 @@
/**
* IK 中文分词 版本 5.0 IK Analyzer release 5.0
*
* <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional information regarding
* copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License. You may obtain a
* copy of the License at
*
* <p>http://www.apache.org/licenses/LICENSE-2.0
*
* <p>Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*
* <p>源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012乌龙茶工作室 provided by Linliangyi and copyright 2012
* by Oolong studio
*
* <p>Character type identification utility
*/
package com.rymcu.forest.lucene.core;
/** Character type identification utility */
class CharacterUtil {
public static final int CHAR_USELESS = 0;
public static final int CHAR_ARABIC = 0X00000001;
public static final int CHAR_ENGLISH = 0X00000002;
public static final int CHAR_CHINESE = 0X00000004;
public static final int CHAR_OTHER_CJK = 0X00000008;
/**
 * Identifies the character type
 *
 * @param input
 * @return int one of the character type constants defined in CharacterUtil
 */
static int identifyCharType(char input) {
if (input >= '0' && input <= '9') {
return CHAR_ARABIC;
} else if ((input >= 'a' && input <= 'z') || (input >= 'A' && input <= 'Z')) {
return CHAR_ENGLISH;
} else {
Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
|| ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
|| ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) {
// The currently known Chinese character blocks
return CHAR_CHINESE;
} else if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS // full-width digits and Japanese/Korean forms
// Korean blocks
|| ub == Character.UnicodeBlock.HANGUL_SYLLABLES
|| ub == Character.UnicodeBlock.HANGUL_JAMO
|| ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
// Japanese blocks
|| ub == Character.UnicodeBlock.HIRAGANA // hiragana
|| ub == Character.UnicodeBlock.KATAKANA // katakana
|| ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) {
return CHAR_OTHER_CJK;
}
}
// All other characters are left unprocessed
return CHAR_USELESS;
}
/**
 * Normalizes a character: converts full-width to half-width and upper case to lower case
 *
 * @param input
 * @return char
 */
static char regularize(char input) {
if (input == 12288) {
input = (char) 32;
} else if (input > 65280 && input < 65375) {
input = (char) (input - 65248);
} else if (input >= 'A' && input <= 'Z') {
input += 32;
}
return input;
}
}
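
Both helpers are package-private and purely functional, so their behavior can be checked directly; a minimal sketch placed in the same package (the demo class is hypothetical, and the expected values follow from the code above):

package com.rymcu.forest.lucene.core;

/** Hypothetical demo of CharacterUtil; not part of this commit. */
public class CharacterUtilDemo {
    public static void main(String[] args) {
        // Full-width '３' (U+FF13) is inside (65280, 65375) and shifts down by 65248
        System.out.println(CharacterUtil.regularize('３') == '3');  // true
        // Full-width space (12288) becomes an ASCII space
        System.out.println(CharacterUtil.regularize('　') == ' ');  // true
        // ASCII upper case is lowered
        System.out.println(CharacterUtil.regularize('A') == 'a');   // true
        // Character type identification
        System.out.println(CharacterUtil.identifyCharType('中') == CharacterUtil.CHAR_CHINESE);   // true
        System.out.println(CharacterUtil.identifyCharType('の') == CharacterUtil.CHAR_OTHER_CJK); // true
        System.out.println(CharacterUtil.identifyCharType('7') == CharacterUtil.CHAR_ARABIC);     // true
    }
}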

src/main/java/com/rymcu/forest/lucene/core/IKArbitrator.java
View File

@ -0,0 +1,144 @@
/**
* IK 中文分词 版本 5.0 IK Analyzer release 5.0
*
* <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional information regarding
* copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License. You may obtain a
* copy of the License at
*
* <p>http://www.apache.org/licenses/LICENSE-2.0
*
* <p>Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*
* <p>源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012乌龙茶工作室 provided by Linliangyi and copyright 2012
* by Oolong studio
*/
package com.rymcu.forest.lucene.core;
import java.util.Stack;
import java.util.TreeSet;
/** IK segmentation ambiguity arbitrator */
class IKArbitrator {
IKArbitrator() {}
/**
 * Resolves segmentation ambiguity
 *
 * @param context
 * @param useSmart
 */
void process(AnalyzeContext context, boolean useSmart) {
QuickSortSet orgLexemes = context.getOrgLexemes();
Lexeme orgLexeme = orgLexemes.pollFirst();
LexemePath crossPath = new LexemePath();
while (orgLexeme != null) {
if (!crossPath.addCrossLexeme(orgLexeme)) {
// Reached the next lexeme that does not overlap crossPath
if (crossPath.size() == 1 || !useSmart) {
// crossPath has no ambiguity, or ambiguity resolution is disabled
// Emit the current crossPath directly
context.addLexemePath(crossPath);
} else {
// Run ambiguity resolution on the current crossPath
QuickSortSet.Cell headCell = crossPath.getHead();
LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
// Emit the arbitration result judgeResult
context.addLexemePath(judgeResult);
}
// Add orgLexeme to a fresh crossPath
crossPath = new LexemePath();
crossPath.addCrossLexeme(orgLexeme);
}
orgLexeme = orgLexemes.pollFirst();
}
// Handle the final path
if (crossPath.size() == 1 || !useSmart) {
// crossPath has no ambiguity, or ambiguity resolution is disabled
// Emit the current crossPath directly
context.addLexemePath(crossPath);
} else {
// Run ambiguity resolution on the current crossPath
QuickSortSet.Cell headCell = crossPath.getHead();
LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
// Emit the arbitration result judgeResult
context.addLexemePath(judgeResult);
}
}
/**
 * Ambiguity arbitration
 *
 * @param lexemeCell head cell of the ambiguous path
 * @param fullTextLength text length of the ambiguous path
 * @return the best candidate LexemePath
 */
private LexemePath judge(QuickSortSet.Cell lexemeCell, int fullTextLength) {
// Candidate path set
TreeSet<LexemePath> pathOptions = new TreeSet<LexemePath>();
// Candidate result path
LexemePath option = new LexemePath();
// Walk the crossPath once, collecting a stack of conflicting lexemes
Stack<QuickSortSet.Cell> lexemeStack = this.forwardPath(lexemeCell, option);
// The current chain may not be optimal; add a copy to the candidate set
pathOptions.add(option.copy());
// Process the ambiguous lexemes
QuickSortSet.Cell c = null;
while (!lexemeStack.isEmpty()) {
c = lexemeStack.pop();
// Roll back the lexeme chain
this.backPath(c.getLexeme(), option);
// Regenerate candidate options starting from the ambiguous lexeme
this.forwardPath(c, option);
pathOptions.add(option.copy());
}
// Return the best option in the set
return pathOptions.first();
}
/**
 * Walks forward through the lexeme chain, adding lexemes to build an unambiguous combination
 *
 * @param option path
 * @return stack of conflicting lexemes
 */
private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell, LexemePath option) {
// Stack of conflicting lexemes
Stack<QuickSortSet.Cell> conflictStack = new Stack<QuickSortSet.Cell>();
QuickSortSet.Cell c = lexemeCell;
// Iterate over the lexeme list
while (c != null && c.getLexeme() != null) {
if (!option.addNotCrossLexeme(c.getLexeme())) {
// The lexeme overlaps and cannot be added; push it onto the conflict stack
conflictStack.push(c);
}
c = c.getNext();
}
return conflictStack;
}
/**
 * Rolls the lexeme chain back until it can accept the given lexeme
 *
 * @param l
 * @param option
 */
private void backPath(Lexeme l, LexemePath option) {
while (option.checkCross(l)) {
option.removeTail();
}
}
}

src/main/java/com/rymcu/forest/lucene/core/IKSegmenter.java
View File

@ -0,0 +1,158 @@
/**
* IK 中文分词 版本 5.0 IK Analyzer release 5.0
*
* <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional information regarding
* copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License. You may obtain a
* copy of the License at
*
* <p>http://www.apache.org/licenses/LICENSE-2.0
*
* <p>Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*
* <p>源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012乌龙茶工作室 provided by Linliangyi and copyright 2012
* by Oolong studio
*/
package com.rymcu.forest.lucene.core;
import com.rymcu.forest.lucene.cfg.Configuration;
import com.rymcu.forest.lucene.cfg.DefaultConfig;
import com.rymcu.forest.lucene.dic.Dictionary;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
/** Main IK segmenter class */
public final class IKSegmenter {
/** Character stream reader */
private Reader input;
/** Segmenter configuration */
private Configuration cfg;
/** Segmentation context */
private AnalyzeContext context;
/** Sub-segmenter list */
private List<ISegmenter> segmenters;
/** Ambiguity arbitrator */
private IKArbitrator arbitrator;
/**
 * IK segmenter constructor
 *
 * @param input
 * @param useSmart true to use the smart segmentation strategy
 * <p>Non-smart (fine-grained) mode outputs every possible split; smart mode merges numerals with quantifiers and arbitrates ambiguous results
 */
public IKSegmenter(Reader input, boolean useSmart) {
this.input = input;
this.cfg = DefaultConfig.getInstance();
this.cfg.setUseSmart(useSmart);
this.init();
}
/**
 * IK segmenter constructor
 *
 * @param input
 * @param cfg builds the segmenter with a custom Configuration
 */
public IKSegmenter(Reader input, Configuration cfg) {
this.input = input;
this.cfg = cfg;
this.init();
}
/** Initialization */
private void init() {
// Initialize the dictionary singleton
Dictionary.initial(this.cfg);
// Initialize the segmentation context
this.context = new AnalyzeContext(this.cfg);
// Load the sub-segmenters
this.segmenters = this.loadSegmenters();
// Load the ambiguity arbitrator
this.arbitrator = new IKArbitrator();
}
/**
 * Loads the sub-segmenter implementations
 *
 * @return List<ISegmenter>
 */
private List<ISegmenter> loadSegmenters() {
List<ISegmenter> segmenters = new ArrayList<ISegmenter>(4);
// Sub-segmenter for letters and digits
segmenters.add(new LetterSegmenter());
// Sub-segmenter for Chinese numerals and quantifiers
segmenters.add(new CN_QuantifierSegmenter());
// Sub-segmenter for Chinese words
segmenters.add(new CJKSegmenter());
return segmenters;
}
/**
 * Gets the next lexeme
 *
 * @return Lexeme the next lexeme
 * @throws IOException
 */
public synchronized Lexeme next() throws IOException {
Lexeme l = null;
while ((l = context.getNextLexeme()) == null) {
/*
 * Read data from the reader to fill the buffer. When the reader is consumed in several passes,
 * the buffer must be shifted so that data read earlier but not yet processed is preserved.
 */
int available = context.fillBuffer(this.input);
if (available <= 0) {
// The reader is exhausted
context.reset();
return null;
} else {
// Initialize the cursor
context.initCursor();
do {
// Run each sub-segmenter
for (ISegmenter segmenter : segmenters) {
segmenter.analyze(context);
}
// The buffer is nearly consumed; new characters must be read in
if (context.needRefillBuffer()) {
break;
}
// Advance the cursor
} while (context.moveCursor());
// Reset the sub-segmenters to prepare for the next round
for (ISegmenter segmenter : segmenters) {
segmenter.reset();
}
}
// Arbitrate ambiguity in the segmentation results
this.arbitrator.process(context, this.cfg.useSmart());
// Output the results, handling any unsegmented single CJK characters
context.outputToResult();
// Record the buffer offset for this pass
context.markBufferOffset();
}
return l;
}
/**
 * Resets the segmenter to its initial state
 *
 * @param input
 */
public synchronized void reset(Reader input) {
this.input = input;
context.reset();
for (ISegmenter segmenter : segmenters) {
segmenter.reset();
}
}
}
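
Putting the pieces together, next() fills the buffer, runs the three sub-segmenters character by character, arbitrates ambiguity, and drains lexemes one at a time. A minimal usage sketch, assuming the dictionary files referenced by DefaultConfig are present on the classpath (the demo class itself is hypothetical):

import com.rymcu.forest.lucene.core.IKSegmenter;
import com.rymcu.forest.lucene.core.Lexeme;

import java.io.IOException;
import java.io.StringReader;

public class IKSegmenterDemo {
    public static void main(String[] args) throws IOException {
        // true = smart mode: merge numerals with quantifiers and arbitrate ambiguous splits
        IKSegmenter ik = new IKSegmenter(new StringReader("这是一个中文分词的例子"), true);
        Lexeme lexeme;
        while ((lexeme = ik.next()) != null) { // null marks the end of the input
            System.out.println(lexeme.getLexemeText() + " [" + lexeme.getLexemeTypeString() + "]");
        }
        // The same instance can be reused for new input
        ik.reset(new StringReader("IK Analyzer 2012"));
    }
}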

src/main/java/com/rymcu/forest/lucene/core/ISegmenter.java
View File

@ -0,0 +1,44 @@
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.rymcu.forest.lucene.core;
/**
 * Sub-segmenter interface
 */
interface ISegmenter {
/**
 * Reads the next possible lexeme from the analyzer
 * @param context segmentation context
 */
void analyze(AnalyzeContext context);
/**
 * Resets the sub-segmenter state
 */
void reset();
}

src/main/java/com/rymcu/forest/lucene/core/LetterSegmenter.java
View File

@ -0,0 +1,283 @@
/**
* IK 中文分词 版本 5.0 IK Analyzer release 5.0
*
* <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional information regarding
* copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License. You may obtain a
* copy of the License at
*
* <p>http://www.apache.org/licenses/LICENSE-2.0
*
* <p>Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*
* <p>源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012乌龙茶工作室 provided by Linliangyi and copyright 2012
* by Oolong studio
*/
package com.rymcu.forest.lucene.core;
import java.util.Arrays;
/** Sub-segmenter for Latin letters and Arabic digits */
class LetterSegmenter implements ISegmenter {
/** Sub-segmenter name */
static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
/** Connector characters */
private static final char[] Letter_Connector = new char[] {'#', '&', '+', '-', '.', '@', '_'};
/** Numeric connector characters */
private static final char[] Num_Connector = new char[] {',', '.'};
/** Start position of the lexeme; doubles as the segmenter's state flag: when start > -1 the segmenter is processing characters */
private int start;
/** End position of the lexeme; records the position of the last letter character that is not a connector */
private int end;
/** Letter start position */
private int englishStart;
/** Letter end position */
private int englishEnd;
/** Arabic digit start position */
private int arabicStart;
/** Arabic digit end position */
private int arabicEnd;
LetterSegmenter() {
Arrays.sort(Letter_Connector);
Arrays.sort(Num_Connector);
this.start = -1;
this.end = -1;
this.englishStart = -1;
this.englishEnd = -1;
this.arabicStart = -1;
this.arabicEnd = -1;
}
@Override
public void analyze(AnalyzeContext context) {
boolean bufferLockFlag = false;
// Process Latin letters
bufferLockFlag = this.processEnglishLetter(context) || bufferLockFlag;
// Process Arabic digits
bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag;
// Process mixed letters and digits (kept last so QuickSortSet can weed out duplicates)
bufferLockFlag = this.processMixLetter(context) || bufferLockFlag;
// Lock or unlock the buffer
if (bufferLockFlag) {
context.lockBuffer(SEGMENTER_NAME);
} else {
// Unlock the buffer
context.unlockBuffer(SEGMENTER_NAME);
}
}
@Override
public void reset() {
this.start = -1;
this.end = -1;
this.englishStart = -1;
this.englishEnd = -1;
this.arabicStart = -1;
this.arabicEnd = -1;
}
/**
 * Processes mixed digit/letter output, e.g. windos2000 | linliangyi2005@gmail.com
 *
 * @param context
 * @return
 */
private boolean processMixLetter(AnalyzeContext context) {
boolean needLock = false;
if (this.start == -1) { // The segmenter has not started processing yet
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
// Record the start position; the segmenter enters the processing state
this.start = context.getCursor();
this.end = start;
}
} else { // The segmenter is processing characters
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
// Record a possible end position
this.end = context.getCursor();
} else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
&& this.isLetterConnector(context.getCurrentChar())) {
// Record a possible end position
this.end = context.getCursor();
} else {
// Non-letter character; emit the lexeme
Lexeme newLexeme =
new Lexeme(
context.getBufferOffset(),
this.start,
this.end - this.start + 1,
Lexeme.TYPE_LETTER);
context.addLexeme(newLexeme);
this.start = -1;
this.end = -1;
}
}
// Check whether the buffer has been fully consumed
if (context.isBufferConsumed()) {
if (this.start != -1 && this.end != -1) {
// Buffer consumed; emit the lexeme
Lexeme newLexeme =
new Lexeme(
context.getBufferOffset(),
this.start,
this.end - this.start + 1,
Lexeme.TYPE_LETTER);
context.addLexeme(newLexeme);
this.start = -1;
this.end = -1;
}
}
// Keep the buffer locked while a lexeme is still in progress
needLock = this.start != -1 || this.end != -1;
return needLock;
}
/**
 * Processes pure Latin-letter output
 *
 * @param context
 * @return
 */
private boolean processEnglishLetter(AnalyzeContext context) {
boolean needLock = false;
if (this.englishStart == -1) { // The segmenter has not started processing letters yet
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
// Record the start position; the segmenter enters the processing state
this.englishStart = context.getCursor();
this.englishEnd = this.englishStart;
}
} else { // The segmenter is processing letters
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
// Record the current cursor position as the end position
this.englishEnd = context.getCursor();
} else {
// Non-English character; emit the lexeme
Lexeme newLexeme =
new Lexeme(
context.getBufferOffset(),
this.englishStart,
this.englishEnd - this.englishStart + 1,
Lexeme.TYPE_ENGLISH);
context.addLexeme(newLexeme);
this.englishStart = -1;
this.englishEnd = -1;
}
}
// Check whether the buffer has been fully consumed
if (context.isBufferConsumed()) {
if (this.englishStart != -1 && this.englishEnd != -1) {
// Buffer consumed; emit the lexeme
Lexeme newLexeme =
new Lexeme(
context.getBufferOffset(),
this.englishStart,
this.englishEnd - this.englishStart + 1,
Lexeme.TYPE_ENGLISH);
context.addLexeme(newLexeme);
this.englishStart = -1;
this.englishEnd = -1;
}
}
// Keep the buffer locked while a lexeme is still in progress
needLock = this.englishStart != -1 || this.englishEnd != -1;
return needLock;
}
/**
 * Processes Arabic digit output
 *
 * @param context
 * @return
 */
private boolean processArabicLetter(AnalyzeContext context) {
boolean needLock = false;
if (this.arabicStart == -1) { // The segmenter has not started processing digits yet
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
// Record the start position; the segmenter enters the processing state
this.arabicStart = context.getCursor();
this.arabicEnd = this.arabicStart;
}
} else { // The segmenter is processing digits
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
// Record the current cursor position as the end position
this.arabicEnd = context.getCursor();
} else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
&& this.isNumConnector(context.getCurrentChar())) {
// Do not emit a digit yet, but do not mark the end either
} else {
// Non-Arabic character; emit the lexeme
Lexeme newLexeme =
new Lexeme(
context.getBufferOffset(),
this.arabicStart,
this.arabicEnd - this.arabicStart + 1,
Lexeme.TYPE_ARABIC);
context.addLexeme(newLexeme);
this.arabicStart = -1;
this.arabicEnd = -1;
}
}
// Check whether the buffer has been fully consumed
if (context.isBufferConsumed()) {
if (this.arabicStart != -1 && this.arabicEnd != -1) {
// Generate the segmented lexeme
Lexeme newLexeme =
new Lexeme(
context.getBufferOffset(),
this.arabicStart,
this.arabicEnd - this.arabicStart + 1,
Lexeme.TYPE_ARABIC);
context.addLexeme(newLexeme);
this.arabicStart = -1;
this.arabicEnd = -1;
}
}
// Keep the buffer locked while a lexeme is still in progress
needLock = this.arabicStart != -1 || this.arabicEnd != -1;
return needLock;
}
/**
 * Checks whether the character is a letter connector
 *
 * @param input
 * @return
 */
private boolean isLetterConnector(char input) {
int index = Arrays.binarySearch(Letter_Connector, input);
return index >= 0;
}
/**
 * Checks whether the character is a numeric connector
 *
 * @param input
 * @return
 */
private boolean isNumConnector(char input) {
int index = Arrays.binarySearch(Num_Connector, input);
return index >= 0;
}
}

src/main/java/com/rymcu/forest/lucene/core/Lexeme.java
View File

@ -0,0 +1,279 @@
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.rymcu.forest.lucene.core;
/**
 * IK lexeme object
 */
public class Lexeme implements Comparable<Lexeme> {
// lexemeType constants
// Unknown
public static final int TYPE_UNKNOWN = 0;
// English
public static final int TYPE_ENGLISH = 1;
// Arabic digits
public static final int TYPE_ARABIC = 2;
// Mixed English letters and digits
public static final int TYPE_LETTER = 3;
// Chinese word
public static final int TYPE_CNWORD = 4;
// Single Chinese character
public static final int TYPE_CNCHAR = 64;
// Japanese/Korean characters
public static final int TYPE_OTHER_CJK = 8;
// Chinese numeral
public static final int TYPE_CNUM = 16;
// Chinese quantifier
public static final int TYPE_COUNT = 32;
// Chinese numeral + quantifier
public static final int TYPE_CQUAN = 48;
// Start offset of the lexeme
private int offset;
// Relative start position of the lexeme
private int begin;
// Length of the lexeme
private int length;
// Lexeme text
private String lexemeText;
// Lexeme type
private int lexemeType;
public Lexeme(int offset, int begin, int length, int lexemeType) {
this.offset = offset;
this.begin = begin;
if (length < 0) {
throw new IllegalArgumentException("length < 0");
}
this.length = length;
this.lexemeType = lexemeType;
}
/*
* Lexeme equality: same offset, same relative start, and same length
* @see java.lang.Object#equals(Object o)
*/
public boolean equals(Object o) {
if (o == null) {
return false;
}
if (this == o) {
return true;
}
if (o instanceof Lexeme) {
Lexeme other = (Lexeme) o;
if (this.offset == other.getOffset() && this.begin == other.getBegin()
&& this.length == other.getLength()) {
return true;
} else {
return false;
}
} else {
return false;
}
}
/*
* Lexeme hash code algorithm
* @see java.lang.Object#hashCode()
*/
public int hashCode() {
int absBegin = getBeginPosition();
int absEnd = getEndPosition();
return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
}
/*
* Comparison algorithm used when lexemes are held in a sorted set
* @see java.lang.Comparable#compareTo(java.lang.Object)
*/
public int compareTo(Lexeme other) {
// Earlier start position first
if (this.begin < other.getBegin()) {
return -1;
} else if (this.begin == other.getBegin()) {
// Then the longer lexeme first
if (this.length > other.getLength()) {
return -1;
} else if (this.length == other.getLength()) {
return 0;
} else {// this.length < other.getLength()
return 1;
}
} else {// this.begin > other.getBegin()
return 1;
}
}
public int getOffset() {
return offset;
}
public void setOffset(int offset) {
this.offset = offset;
}
public int getBegin() {
return begin;
}
/**
* Gets the lexeme's absolute start position in the source text
* @return int
*/
public int getBeginPosition() {
return offset + begin;
}
public void setBegin(int begin) {
this.begin = begin;
}
/**
* Gets the lexeme's absolute end position in the source text
* @return int
*/
public int getEndPosition() {
return offset + begin + length;
}
/**
* Gets the lexeme's character length
* @return int
*/
public int getLength() {
return this.length;
}
public void setLength(int length) {
if (length < 0) {
throw new IllegalArgumentException("length < 0");
}
this.length = length;
}
/**
* Gets the lexeme text
* @return String
*/
public String getLexemeText() {
if (lexemeText == null) {
return "";
}
return lexemeText;
}
public void setLexemeText(String lexemeText) {
if (lexemeText == null) {
this.lexemeText = "";
this.length = 0;
} else {
this.lexemeText = lexemeText;
this.length = lexemeText.length();
}
}
/**
* Gets the lexeme type
* @return int
*/
public int getLexemeType() {
return lexemeType;
}
/**
* Gets the lexeme type label string
* @return String
*/
public String getLexemeTypeString() {
switch (lexemeType) {
case TYPE_ENGLISH:
return "ENGLISH";
case TYPE_ARABIC:
return "ARABIC";
case TYPE_LETTER:
return "LETTER";
case TYPE_CNWORD:
return "CN_WORD";
case TYPE_CNCHAR:
return "CN_CHAR";
case TYPE_OTHER_CJK:
return "OTHER_CJK";
case TYPE_COUNT:
return "COUNT";
case TYPE_CNUM:
return "TYPE_CNUM";
case TYPE_CQUAN:
return "TYPE_CQUAN";
default:
return "UNKONW";
}
}
public void setLexemeType(int lexemeType) {
this.lexemeType = lexemeType;
}
/**
* 合并两个相邻的词元
* @param l
* @param lexemeType
* @return boolean 词元是否成功合并
*/
public boolean append(Lexeme l, int lexemeType) {
if (l != null && this.getEndPosition() == l.getBeginPosition()) {
this.length += l.getLength();
this.lexemeType = lexemeType;
return true;
} else {
return false;
}
}
/**
*
*/
public String toString() {
StringBuffer strbuf = new StringBuffer();
strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition());
strbuf.append(" : ").append(this.lexemeText).append(" : \t");
strbuf.append(this.getLexemeTypeString());
return strbuf.toString();
}
}
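
For orientation, a minimal same-package sketch (not part of this commit) of the contracts above: compareTo sorts by start position first and prefers the longer lexeme at the same start, while append only merges strictly adjacent lexemes. All positions below are placeholders.

package com.rymcu.forest.lucene.core;

/** Sketch: Lexeme ordering and merging. Hypothetical demo class, not in the commit. */
class LexemeOrderingDemo {
  public static void main(String[] args) {
    Lexeme a = new Lexeme(0, 0, 3, Lexeme.TYPE_CNWORD); // covers [0,3)
    Lexeme b = new Lexeme(0, 0, 1, Lexeme.TYPE_CNCHAR); // covers [0,1)
    Lexeme c = new Lexeme(0, 2, 2, Lexeme.TYPE_CNWORD); // covers [2,4)
    System.out.println(a.compareTo(b)); // -1: same begin, the longer lexeme sorts first
    System.out.println(a.compareTo(c)); // -1: the smaller begin sorts first
    Lexeme d = new Lexeme(0, 3, 2, Lexeme.TYPE_CNWORD); // covers [3,5)
    System.out.println(a.append(d, Lexeme.TYPE_CNWORD)); // true: d starts exactly where a ends
    System.out.println(a.getLength()); // 5: a now covers [0,5)
  }
}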

@@ -0,0 +1,255 @@
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.rymcu.forest.lucene.core;
/**
 * Lexeme chain path: one candidate segmentation over a span of text
 */
class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
// 起始位置
private int pathBegin;
// 结束
private int pathEnd;
// 词元链的有效字符长度
private int payloadLength;
LexemePath() {
this.pathBegin = -1;
this.pathEnd = -1;
this.payloadLength = 0;
}
/**
* 向LexemePath追加相交的Lexeme
* @param lexeme
* @return
*/
boolean addCrossLexeme(Lexeme lexeme) {
if (this.isEmpty()) {
this.addLexeme(lexeme);
this.pathBegin = lexeme.getBegin();
this.pathEnd = lexeme.getBegin() + lexeme.getLength();
this.payloadLength += lexeme.getLength();
return true;
} else if (this.checkCross(lexeme)) {
this.addLexeme(lexeme);
if (lexeme.getBegin() + lexeme.getLength() > this.pathEnd) {
this.pathEnd = lexeme.getBegin() + lexeme.getLength();
}
this.payloadLength = this.pathEnd - this.pathBegin;
return true;
} else {
return false;
}
}
/**
* 向LexemePath追加不相交的Lexeme
* @param lexeme
* @return
*/
boolean addNotCrossLexeme(Lexeme lexeme) {
if (this.isEmpty()) {
this.addLexeme(lexeme);
this.pathBegin = lexeme.getBegin();
this.pathEnd = lexeme.getBegin() + lexeme.getLength();
this.payloadLength += lexeme.getLength();
return true;
} else if (this.checkCross(lexeme)) {
return false;
} else {
this.addLexeme(lexeme);
this.payloadLength += lexeme.getLength();
Lexeme head = this.peekFirst();
this.pathBegin = head.getBegin();
Lexeme tail = this.peekLast();
this.pathEnd = tail.getBegin() + tail.getLength();
return true;
}
}
/**
* 移除尾部的Lexeme
* @return
*/
Lexeme removeTail() {
Lexeme tail = this.pollLast();
if (this.isEmpty()) {
this.pathBegin = -1;
this.pathEnd = -1;
this.payloadLength = 0;
} else {
this.payloadLength -= tail.getLength();
Lexeme newTail = this.peekLast();
this.pathEnd = newTail.getBegin() + newTail.getLength();
}
return tail;
}
/**
* 检测词元位置交叉有歧义的切分
* @param lexeme
* @return
*/
boolean checkCross(Lexeme lexeme) {
return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd)
|| (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin()
+ lexeme.getLength());
}
int getPathBegin() {
return pathBegin;
}
int getPathEnd() {
return pathEnd;
}
/**
* 获取Path的有效词长
* @return
*/
int getPayloadLength() {
return this.payloadLength;
}
/**
* 获取LexemePath的路径长度
* @return
*/
int getPathLength() {
return this.pathEnd - this.pathBegin;
}
/**
* X权重词元长度积
* @return
*/
int getXWeight() {
int product = 1;
Cell c = this.getHead();
while (c != null && c.getLexeme() != null) {
product *= c.getLexeme().getLength();
c = c.getNext();
}
return product;
}
/**
* 词元位置权重
* @return
*/
int getPWeight() {
int pWeight = 0;
int p = 0;
Cell c = this.getHead();
while (c != null && c.getLexeme() != null) {
p++;
pWeight += p * c.getLexeme().getLength();
c = c.getNext();
}
return pWeight;
}
LexemePath copy() {
LexemePath theCopy = new LexemePath();
theCopy.pathBegin = this.pathBegin;
theCopy.pathEnd = this.pathEnd;
theCopy.payloadLength = this.payloadLength;
Cell c = this.getHead();
while (c != null && c.getLexeme() != null) {
theCopy.addLexeme(c.getLexeme());
c = c.getNext();
}
return theCopy;
}
public int compareTo(LexemePath o) {
// 比较有效文本长度
if (this.payloadLength > o.payloadLength) {
return -1;
} else if (this.payloadLength < o.payloadLength) {
return 1;
} else {
// 比较词元个数越少越好
if (this.size() < o.size()) {
return -1;
} else if (this.size() > o.size()) {
return 1;
} else {
// 路径跨度越大越好
if (this.getPathLength() > o.getPathLength()) {
return -1;
} else if (this.getPathLength() < o.getPathLength()) {
return 1;
} else {
// 根据统计学结论逆向切分概率高于正向切分因此位置越靠后的优先
if (this.pathEnd > o.pathEnd) {
return -1;
} else if (pathEnd < o.pathEnd) {
return 1;
} else {
// 词长越平均越好
if (this.getXWeight() > o.getXWeight()) {
return -1;
} else if (this.getXWeight() < o.getXWeight()) {
return 1;
} else {
// 词元位置权重比较
if (this.getPWeight() > o.getPWeight()) {
return -1;
} else if (this.getPWeight() < o.getPWeight()) {
return 1;
}
}
}
}
}
}
return 0;
}
public String toString() {
StringBuffer sb = new StringBuffer();
sb.append("pathBegin : ").append(pathBegin).append("\r\n");
sb.append("pathEnd : ").append(pathEnd).append("\r\n");
sb.append("payloadLength : ").append(payloadLength).append("\r\n");
Cell head = this.getHead();
while (head != null) {
sb.append("lexeme : ").append(head.getLexeme()).append("\r\n");
head = head.getNext();
}
return sb.toString();
}
}
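
The compareTo chain above encodes IK's disambiguation heuristics in strict priority order. A same-package sketch (not part of this commit) of the first two rules, with placeholder positions:

package com.rymcu.forest.lucene.core;

/** Sketch: ranking two candidate segmentations of the same span. Not in the commit. */
class LexemePathDemo {
  public static void main(String[] args) {
    // path 1: a single lexeme covering [0,4)
    LexemePath p1 = new LexemePath();
    p1.addNotCrossLexeme(new Lexeme(0, 0, 4, Lexeme.TYPE_CNWORD));
    // path 2: two lexemes covering [0,2) and [2,4)
    LexemePath p2 = new LexemePath();
    p2.addNotCrossLexeme(new Lexeme(0, 0, 2, Lexeme.TYPE_CNWORD));
    p2.addNotCrossLexeme(new Lexeme(0, 2, 2, Lexeme.TYPE_CNWORD));
    // equal payload length (4), so the path with fewer lexemes ranks first
    System.out.println(p1.compareTo(p2)); // -1: p1 wins
  }
}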

@@ -0,0 +1,239 @@
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.rymcu.forest.lucene.core;
/**
 * Quick-sorted Lexeme set used internally by the IK segmenter
 */
class QuickSortSet {
// 链表头
private Cell head;
// 链表尾
private Cell tail;
// 链表的实际大小
private int size;
QuickSortSet() {
this.size = 0;
}
/**
* 向链表集合添加词元
* @param lexeme
*/
boolean addLexeme(Lexeme lexeme) {
Cell newCell = new Cell(lexeme);
if (this.size == 0) {
this.head = newCell;
this.tail = newCell;
this.size++;
return true;
} else {
if (this.tail.compareTo(newCell) == 0) {// 词元与尾部词元相同不放入集合
return false;
} else if (this.tail.compareTo(newCell) < 0) {// 词元接入链表尾部
this.tail.next = newCell;
newCell.prev = this.tail;
this.tail = newCell;
this.size++;
return true;
} else if (this.head.compareTo(newCell) > 0) {// 词元接入链表头部
this.head.prev = newCell;
newCell.next = this.head;
this.head = newCell;
this.size++;
return true;
} else {
// 从尾部上逆
Cell index = this.tail;
while (index != null && index.compareTo(newCell) > 0) {
index = index.prev;
}
if (index.compareTo(newCell) == 0) {// 词元与集合中的词元重复不放入集合
return false;
} else if (index.compareTo(newCell) < 0) {// 词元插入链表中的某个位置
newCell.prev = index;
newCell.next = index.next;
index.next.prev = newCell;
index.next = newCell;
this.size++;
return true;
}
}
}
return false;
}
/**
* 返回链表头部元素
* @return
*/
Lexeme peekFirst() {
if (this.head != null) {
return this.head.lexeme;
}
return null;
}
/**
* 取出链表集合的第一个元素
* @return Lexeme
*/
Lexeme pollFirst() {
if (this.size == 1) {
Lexeme first = this.head.lexeme;
this.head = null;
this.tail = null;
this.size--;
return first;
} else if (this.size > 1) {
Lexeme first = this.head.lexeme;
this.head = this.head.next;
this.size--;
return first;
} else {
return null;
}
}
/**
* 返回链表尾部元素
* @return
*/
Lexeme peekLast() {
if (this.tail != null) {
return this.tail.lexeme;
}
return null;
}
/**
* 取出链表集合的最后一个元素
* @return Lexeme
*/
Lexeme pollLast() {
if (this.size == 1) {
Lexeme last = this.head.lexeme;
this.head = null;
this.tail = null;
this.size--;
return last;
} else if (this.size > 1) {
Lexeme last = this.tail.lexeme;
this.tail = this.tail.prev;
this.size--;
return last;
} else {
return null;
}
}
/**
* 返回集合大小
* @return
*/
int size() {
return this.size;
}
/**
* 判断集合是否为空
* @return
*/
boolean isEmpty() {
return this.size == 0;
}
/**
* 返回lexeme链的头部
* @return
*/
Cell getHead() {
return this.head;
}
  /**
   * QuickSortSet cell: a doubly-linked list node holding one Lexeme
   */
class Cell implements Comparable<Cell> {
private Cell prev;
private Cell next;
private Lexeme lexeme;
Cell(Lexeme lexeme) {
if (lexeme == null) {
throw new IllegalArgumentException("lexeme must not be null");
}
this.lexeme = lexeme;
}
public int compareTo(Cell o) {
return this.lexeme.compareTo(o.lexeme);
}
public Cell getPrev() {
return this.prev;
}
public Cell getNext() {
return this.next;
}
public Lexeme getLexeme() {
return this.lexeme;
}
}
}
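
A same-package sketch (not part of this commit) of the set's two guarantees: lexemes stay sorted by their compareTo order, and exact duplicates are rejected.

package com.rymcu.forest.lucene.core;

/** Sketch: sorted insertion and duplicate rejection in QuickSortSet. Not in the commit. */
class QuickSortSetDemo {
  public static void main(String[] args) {
    QuickSortSet set = new QuickSortSet();
    set.addLexeme(new Lexeme(0, 2, 2, Lexeme.TYPE_CNWORD)); // covers [2,4)
    set.addLexeme(new Lexeme(0, 0, 2, Lexeme.TYPE_CNWORD)); // covers [0,2), inserted at the head
    boolean added = set.addLexeme(new Lexeme(0, 2, 2, Lexeme.TYPE_CNWORD));
    System.out.println(added);                      // false: duplicate of the tail element
    System.out.println(set.peekFirst().getBegin()); // 0: kept in sorted order
    System.out.println(set.size());                 // 2
  }
}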

@@ -0,0 +1,328 @@
/**
*
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.rymcu.forest.lucene.dic;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
/**
 * Dictionary trie segment, representing one branch of the dictionary tree
 */
class DictSegment implements Comparable<DictSegment> {
// 公用字典表存储汉字
private static final Map<Character, Character> charMap = new HashMap<Character, Character>(16,
0.95f);
// 数组大小上限
private static final int ARRAY_LENGTH_LIMIT = 3;
// Map存储结构
  // Map存储结构 (volatile: published via double-checked locking in getChildrenMap)
  private volatile Map<Character, DictSegment> childrenMap;
  // 数组方式存储结构 (volatile: published via double-checked locking in getChildrenArray)
  private volatile DictSegment[] childrenArray;
// 当前节点上存储的字符
private Character nodeChar;
// 当前节点存储的Segment数目
// storeSize <=ARRAY_LENGTH_LIMIT 使用数组存储 storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
private int storeSize = 0;
// 当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
private int nodeState = 0;
DictSegment(Character nodeChar) {
if (nodeChar == null) {
throw new IllegalArgumentException("参数为空异常,字符不能为空");
}
this.nodeChar = nodeChar;
}
Character getNodeChar() {
return nodeChar;
}
/*
* 判断是否有下一个节点
*/
boolean hasNextNode() {
return this.storeSize > 0;
}
/**
* 匹配词段
* @param charArray
* @return Hit
*/
Hit match(char[] charArray) {
return this.match(charArray, 0, charArray.length, null);
}
/**
* 匹配词段
* @param charArray
* @param begin
* @param length
* @return Hit
*/
Hit match(char[] charArray, int begin, int length) {
return this.match(charArray, begin, length, null);
}
/**
* 匹配词段
* @param charArray
* @param begin
* @param length
* @param searchHit
* @return Hit
*/
Hit match(char[] charArray, int begin, int length, Hit searchHit) {
if (searchHit == null) {
// 如果hit为空新建
searchHit = new Hit();
      // 设置hit的起始文本位置
searchHit.setBegin(begin);
} else {
// 否则要将HIT状态重置
searchHit.setUnmatch();
}
// 设置hit的当前处理位置
searchHit.setEnd(begin);
    Character keyChar = Character.valueOf(charArray[begin]);
DictSegment ds = null;
// 引用实例变量为本地变量避免查询时遇到更新的同步问题
DictSegment[] segmentArray = this.childrenArray;
Map<Character, DictSegment> segmentMap = this.childrenMap;
// STEP1 在节点中查找keyChar对应的DictSegment
if (segmentArray != null) {
// 在数组中查找
DictSegment keySegment = new DictSegment(keyChar);
int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
if (position >= 0) {
ds = segmentArray[position];
}
} else if (segmentMap != null) {
// 在map中查找
      ds = segmentMap.get(keyChar);
}
// STEP2 找到DictSegment判断词的匹配状态是否继续递归还是返回结果
if (ds != null) {
if (length > 1) {
// 词未匹配完继续往下搜索
return ds.match(charArray, begin + 1, length - 1, searchHit);
} else if (length == 1) {
// 搜索最后一个char
if (ds.nodeState == 1) {
// 添加HIT状态为完全匹配
searchHit.setMatch();
}
if (ds.hasNextNode()) {
// 添加HIT状态为前缀匹配
searchHit.setPrefix();
// 记录当前位置的DictSegment
searchHit.setMatchedDictSegment(ds);
}
return searchHit;
}
}
// STEP3 没有找到DictSegment 将HIT设置为不匹配
return searchHit;
}
/**
* 加载填充词典片段
* @param charArray
*/
void fillSegment(char[] charArray) {
this.fillSegment(charArray, 0, charArray.length, 1);
}
/**
* 屏蔽词典中的一个词
* @param charArray
*/
void disableSegment(char[] charArray) {
this.fillSegment(charArray, 0, charArray.length, 0);
}
/**
* 加载填充词典片段
* @param charArray
* @param begin
* @param length
* @param enabled
*/
private synchronized void fillSegment(char[] charArray, int begin, int length, int enabled) {
// 获取字典表中的汉字对象
    Character beginChar = Character.valueOf(charArray[begin]);
Character keyChar = charMap.get(beginChar);
// 字典中没有该字则将其添加入字典
if (keyChar == null) {
charMap.put(beginChar, beginChar);
keyChar = beginChar;
}
    // 搜索当前节点的存储,查询对应keyChar的segment,如果没有则创建
DictSegment ds = lookforSegment(keyChar, enabled);
if (ds != null) {
// 处理keyChar对应的segment
if (length > 1) {
// 词元还没有完全加入词典树
ds.fillSegment(charArray, begin + 1, length - 1, enabled);
} else if (length == 1) {
// 已经是词元的最后一个char,设置当前节点状态为enabled
// enabled=1表明一个完整的词enabled=0表示从词典中屏蔽当前词
ds.nodeState = enabled;
}
}
}
/**
   * 查找本节点下对应的keyChar的segment
* @param keyChar
* @param create =1如果没有找到则创建新的segment ; =0如果没有找到不创建返回null
* @return
*/
private DictSegment lookforSegment(Character keyChar, int create) {
DictSegment ds = null;
if (this.storeSize <= ARRAY_LENGTH_LIMIT) {
// 获取数组容器如果数组未创建则创建数组
DictSegment[] segmentArray = getChildrenArray();
// 搜寻数组
DictSegment keySegment = new DictSegment(keyChar);
int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
if (position >= 0) {
ds = segmentArray[position];
}
// 遍历数组后没有找到对应的segment
if (ds == null && create == 1) {
ds = keySegment;
if (this.storeSize < ARRAY_LENGTH_LIMIT) {
// 数组容量未满使用数组存储
segmentArray[this.storeSize] = ds;
// segment数目+1
this.storeSize++;
Arrays.sort(segmentArray, 0, this.storeSize);
} else {
// 数组容量已满切换Map存储
// 获取Map容器如果Map未创建,则创建Map
Map<Character, DictSegment> segmentMap = getChildrenMap();
// 将数组中的segment迁移到Map中
migrate(segmentArray, segmentMap);
// 存储新的segment
segmentMap.put(keyChar, ds);
// segment数目+1 必须在释放数组前执行storeSize++ 确保极端情况下不会取到空的数组
this.storeSize++;
// 释放当前的数组引用
this.childrenArray = null;
}
}
} else {
// 获取Map容器如果Map未创建,则创建Map
Map<Character, DictSegment> segmentMap = getChildrenMap();
// 搜索Map
      ds = segmentMap.get(keyChar);
if (ds == null && create == 1) {
// 构造新的segment
ds = new DictSegment(keyChar);
segmentMap.put(keyChar, ds);
// 当前节点存储segment数目+1
this.storeSize++;
}
}
return ds;
}
/**
* 获取数组容器
* 线程同步方法
*/
private DictSegment[] getChildrenArray() {
if (this.childrenArray == null) {
synchronized (this) {
if (this.childrenArray == null) {
this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT];
}
}
}
return this.childrenArray;
}
/**
* 获取Map容器
* 线程同步方法
*/
private Map<Character, DictSegment> getChildrenMap() {
if (this.childrenMap == null) {
synchronized (this) {
if (this.childrenMap == null) {
this.childrenMap = new HashMap<Character, DictSegment>(ARRAY_LENGTH_LIMIT * 2, 0.8f);
}
}
}
return this.childrenMap;
}
/**
* 将数组中的segment迁移到Map中
* @param segmentArray
*/
private void migrate(DictSegment[] segmentArray, Map<Character, DictSegment> segmentMap) {
for (DictSegment segment : segmentArray) {
if (segment != null) {
segmentMap.put(segment.nodeChar, segment);
}
}
}
/**
* 实现Comparable接口
* @param o
* @return int
*/
@Override
public int compareTo(DictSegment o) {
// 对当前节点存储的char进行比较
return this.nodeChar.compareTo(o.nodeChar);
}
}
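
A same-package sketch (not part of this commit) of the trie in action: a word can be a full match and a prefix at the same time, and disableSegment masks a word without removing its subtree.

package com.rymcu.forest.lucene.dic;

/** Sketch: filling and matching the dictionary trie. Not in the commit. */
class DictSegmentDemo {
  public static void main(String[] args) {
    DictSegment root = new DictSegment((char) 0);
    root.fillSegment("中国".toCharArray());
    root.fillSegment("中国人".toCharArray());
    Hit hit = root.match("中国".toCharArray());
    System.out.println(hit.isMatch());  // true: "中国" is a complete word
    System.out.println(hit.isPrefix()); // true: it also prefixes "中国人"
    root.disableSegment("中国".toCharArray());
    System.out.println(root.match("中国".toCharArray()).isMatch()); // false: masked, subtree intact
  }
}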

@@ -0,0 +1,347 @@
/**
* IK 中文分词 版本 5.0 IK Analyzer release 5.0
*
* <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional information regarding
* copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License. You may obtain a
* copy of the License at
*
* <p>http://www.apache.org/licenses/LICENSE-2.0
*
* <p>Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*
* <p>源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012乌龙茶工作室 provided by Linliangyi and copyright 2012
* by Oolong studio
*/
package com.rymcu.forest.lucene.dic;
import com.rymcu.forest.lucene.cfg.Configuration;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.List;
/** Dictionary manager, singleton. */
public class Dictionary {
/** 词典单例 */
  private static volatile Dictionary singleton;
/** 主词典对象 */
private DictSegment _MainDict;
/** 停止词词典 */
private DictSegment _StopWordDict;
/** 量词词典 */
private DictSegment _QuantifierDict;
/** 用户自定义词典路径 */
private static final String PATH_USER_DIC =
System.getProperty("user.dir") + "/lucene/userDic/userDic.dic";
/** 配置对象 */
private final Configuration cfg;
private Dictionary(Configuration cfg) {
this.cfg = cfg;
this.loadMainDict();
this.loadStopWordDict();
this.loadQuantifierDict();
}
  /**
   * Dictionary initialization. IK Analyzer loads its dictionaries lazily through this class's
   * static methods, so the first segmentation call would otherwise pay the full loading cost.
   * Calling this method during application startup warms the dictionaries up front.
   */
public static void initial(Configuration cfg) {
if (singleton == null) {
synchronized (Dictionary.class) {
if (singleton == null) {
singleton = new Dictionary(cfg);
}
}
}
}
/**
   * 获取词典单例实例
*
* @return Dictionary 单例对象
*/
public static Dictionary getSingleton() {
if (singleton == null) {
throw new IllegalStateException("词典尚未初始化请先调用initial方法");
}
return singleton;
}
/**
* 批量加载新词条
*
* @param words Collection<String>词条列表
*/
public void addWords(Collection<String> words) {
if (words != null) {
for (String word : words) {
if (word != null) {
// 批量加载词条到主内存词典中
singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
}
}
}
}
/**
* 批量移除屏蔽词条
*
* @param words
*/
public void disableWords(Collection<String> words) {
if (words != null) {
for (String word : words) {
if (word != null) {
// 批量屏蔽词条
singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray());
}
}
}
}
/**
* 检索匹配主词典
*
* @param charArray
* @return Hit 匹配结果描述
*/
public Hit matchInMainDict(char[] charArray) {
return singleton._MainDict.match(charArray);
}
/**
* 检索匹配主词典
*
* @param charArray
* @param begin
* @param length
* @return Hit 匹配结果描述
*/
public Hit matchInMainDict(char[] charArray, int begin, int length) {
return singleton._MainDict.match(charArray, begin, length);
}
/**
* 检索匹配量词词典
*
* @param charArray
* @param begin
* @param length
* @return Hit 匹配结果描述
*/
public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
return singleton._QuantifierDict.match(charArray, begin, length);
}
/**
* 从已匹配的Hit中直接取出DictSegment继续向下匹配
*
* @param charArray
* @param currentIndex
* @param matchedHit
* @return Hit
*/
public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
DictSegment ds = matchedHit.getMatchedDictSegment();
return ds.match(charArray, currentIndex, 1, matchedHit);
}
/**
* 判断是否是停止词
*
* @param charArray
* @param begin
* @param length
* @return boolean
*/
public boolean isStopWord(char[] charArray, int begin, int length) {
return singleton._StopWordDict.match(charArray, begin, length).isMatch();
}
/** 加载主词典及扩展词典 */
private void loadMainDict() {
// 建立一个主词典实例
_MainDict = new DictSegment((char) 0);
// 读取主词典文件
Resource resource = new ClassPathResource(cfg.getMainDictionary());
    try (BufferedReader br =
        new BufferedReader(
            new InputStreamReader(resource.getInputStream(), StandardCharsets.UTF_8), 512)) {
      String theWord;
      while ((theWord = br.readLine()) != null) {
        if (!theWord.trim().isEmpty()) {
          _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
        }
      }
    } catch (IOException e) {
      System.err.println("Main Dictionary loading exception.");
      e.printStackTrace();
    }
// 加载扩展词典
this.loadExtDict();
}
/** 加载用户配置的扩展词典到主词库表 */
private void loadExtDict() {
// 加载扩展词典配置
List<String> extDictFiles = cfg.getExtDictionary();
if (extDictFiles != null) {
InputStream is;
for (String extDictName : extDictFiles) {
// 读取扩展词典文件
System.out.println("加载扩展词典:" + extDictName);
is = this.getClass().getClassLoader().getResourceAsStream(extDictName);
// 如果找不到扩展的字典则忽略
if (is == null) {
try {
is = new FileInputStream(extDictName);
} catch (FileNotFoundException e) {
continue;
}
}
try {
BufferedReader br =
new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
// 加载扩展词典数据到主内存词典中
_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
} catch (IOException ioe) {
System.err.println("Extension Dictionary loading exception.");
ioe.printStackTrace();
} finally {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}
/** 加载用户扩展的停止词词典 */
private void loadStopWordDict() {
// 建立一个主词典实例
_StopWordDict = new DictSegment((char) 0);
// 加载扩展停止词典
List<String> extStopWordDictFiles = cfg.getExtStopWordDictionary();
if (extStopWordDictFiles != null) {
InputStream is = null;
for (String extStopWordDictName : extStopWordDictFiles) {
System.out.println("加载扩展停止词典:" + extStopWordDictName);
// 读取扩展词典文件
is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName);
// 如果找不到扩展的字典则忽略
if (is == null) {
continue;
}
try {
          BufferedReader br =
              new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
// 加载扩展停止词典数据到内存中
_StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
} catch (IOException ioe) {
System.err.println("Extension Stop word Dictionary loading exception.");
ioe.printStackTrace();
} finally {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}
/** 加载量词词典 */
private void loadQuantifierDict() {
// 建立一个量词典实例
_QuantifierDict = new DictSegment((char) 0);
// 读取量词词典文件
InputStream is =
this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDictionary());
if (is == null) {
throw new RuntimeException("Quantifier Dictionary not found!!!");
}
try {
BufferedReader br =
new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
} catch (IOException ioe) {
System.err.println("Quantifier Dictionary loading exception.");
ioe.printStackTrace();
} finally {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
/** 加载用户配置的自定义扩展词典到主词库表 */
public void updateUserDict() {
// 加载扩展词典配置
InputStream is;
// 读取扩展词典文件
System.out.println("更新加载扩展词典:" + PATH_USER_DIC);
try {
is = new FileInputStream(PATH_USER_DIC);
} catch (FileNotFoundException e) {
return;
}
try {
BufferedReader br =
new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
// 加载扩展词典数据到主内存词典中
_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
} catch (IOException ioe) {
System.err.println("Extension Dictionary loading exception.");
ioe.printStackTrace();
} finally {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
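
A sketch (not part of this commit) of the intended call order, assuming a Configuration bean whose paths point at valid dictionary resources:

package com.rymcu.forest.lucene.dic;

import com.rymcu.forest.lucene.cfg.Configuration;
import java.util.Collections;

/** Sketch: initializing the singleton and hot-loading words. Not in the commit. */
class DictionaryUsageDemo {
  static void demo(Configuration cfg) {
    Dictionary.initial(cfg); // idempotent; safe to call more than once
    Dictionary dict = Dictionary.getSingleton();
    dict.addWords(Collections.singletonList("量子计算")); // hot-load a word at runtime
    char[] text = "量子计算".toCharArray();
    System.out.println(dict.matchInMainDict(text, 0, text.length).isMatch()); // true
    dict.disableWords(Collections.singletonList("量子计算"));
    System.out.println(dict.matchInMainDict(text, 0, text.length).isMatch()); // false: masked
  }
}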

@@ -0,0 +1,119 @@
/**
*
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.rymcu.forest.lucene.dic;
/**
 * Represents a single dictionary-match hit
 */
public class Hit {
// Hit不匹配
private static final int UNMATCH = 0x00000000;
// Hit完全匹配
private static final int MATCH = 0x00000001;
// Hit前缀匹配
private static final int PREFIX = 0x00000010;
// 该HIT当前状态默认未匹配
private int hitState = UNMATCH;
// 记录词典匹配过程中当前匹配到的词典分支节点
private DictSegment matchedDictSegment;
/*
* 词段开始位置
*/
private int begin;
/*
* 词段的结束位置
*/
private int end;
/**
* 判断是否完全匹配
*/
public boolean isMatch() {
return (this.hitState & MATCH) > 0;
}
/**
*
*/
public void setMatch() {
this.hitState = this.hitState | MATCH;
}
/**
* 判断是否是词的前缀
*/
public boolean isPrefix() {
return (this.hitState & PREFIX) > 0;
}
/**
*
*/
public void setPrefix() {
this.hitState = this.hitState | PREFIX;
}
/**
* 判断是否是不匹配
*/
public boolean isUnmatch() {
return this.hitState == UNMATCH;
}
/**
*
*/
public void setUnmatch() {
this.hitState = UNMATCH;
}
public DictSegment getMatchedDictSegment() {
return matchedDictSegment;
}
public void setMatchedDictSegment(DictSegment matchedDictSegment) {
this.matchedDictSegment = matchedDictSegment;
}
public int getBegin() {
return begin;
}
public void setBegin(int begin) {
this.begin = begin;
}
public int getEnd() {
return end;
}
public void setEnd(int end) {
this.end = end;
}
}
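
Because MATCH (0x01) and PREFIX (0x10) occupy different bits, one hit can carry both states, which is exactly what DictSegment.match produces for a word that also prefixes a longer word. A short sketch (not part of this commit):

package com.rymcu.forest.lucene.dic;

/** Sketch: the hit state is a bit set. Not in the commit. */
class HitStateDemo {
  public static void main(String[] args) {
    Hit hit = new Hit();
    hit.setMatch();  // the walked chars form a complete word...
    hit.setPrefix(); // ...and also a prefix of at least one longer word
    System.out.println(hit.isMatch() && hit.isPrefix()); // true: both bits set
    hit.setUnmatch(); // resets the state, clearing both bits
    System.out.println(hit.isUnmatch()); // true
  }
}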

@@ -0,0 +1,47 @@
package com.rymcu.forest.lucene.lucene;
import com.rymcu.forest.lucene.model.ArticleLucene;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import java.util.List;
import java.util.concurrent.CountDownLatch;
/**
* ArticleBeanIndex
*
* @author suwen
* @date 2021/2/2 14:10
*/
public class ArticleBeanIndex extends BaseIndex<ArticleLucene> {
public ArticleBeanIndex(
String parentIndexPath,
int subIndex,
CountDownLatch countDownLatch1,
CountDownLatch countDownLatch2,
List<ArticleLucene> list) {
super(parentIndexPath, subIndex, countDownLatch1, countDownLatch2, list);
}
@Override
public void indexDoc(IndexWriter writer, ArticleLucene t) throws Exception {
Document doc = new Document();
Field id = new Field("id", t.getIdArticle() + "", TextField.TYPE_STORED);
Field title = new Field("title", t.getArticleTitle(), TextField.TYPE_STORED);
Field summary = new Field("summary", t.getArticleContent(), TextField.TYPE_STORED);
// 添加到Document中
doc.add(id);
doc.add(title);
doc.add(summary);
if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
writer.addDocument(doc);
} else {
writer.updateDocument(new Term("id", t.getIdArticle() + ""), doc);
}
}
}

@@ -0,0 +1,140 @@
package com.rymcu.forest.lucene.lucene;
import com.rymcu.forest.lucene.util.IndexUtil;
import org.apache.lucene.index.IndexWriter;
import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.util.List;
import java.util.concurrent.CountDownLatch;
/**
* BaseIndex
*
* @author suwen
* @date 2021/2/2 14:14
*/
public abstract class BaseIndex<T> implements Runnable {
/** 父级索引路径 */
private String parentIndexPath;
/** 索引编写器 */
private IndexWriter writer;
private int subIndex;
/** 主线程 */
private final CountDownLatch countDownLatch1;
/** 工作线程 */
private final CountDownLatch countDownLatch2;
/** 对象列表 */
private List<T> list;
public BaseIndex(String parentIndexPath, int subIndex) {
this.parentIndexPath = parentIndexPath;
this.subIndex = subIndex;
try {
this.writer = IndexUtil.getIndexWriter(parentIndexPath + "/index" + subIndex, true);
} catch (IOException e) {
e.printStackTrace();
}
    // no latches here: instances from this constructor must not be submitted as a task,
    // since run() dereferences countDownLatch1
    this.countDownLatch1 = null;
    this.countDownLatch2 = null;
}
public BaseIndex(
IndexWriter writer,
CountDownLatch countDownLatch1,
CountDownLatch countDownLatch2,
List<T> list) {
super();
this.writer = writer;
this.countDownLatch1 = countDownLatch1;
this.countDownLatch2 = countDownLatch2;
this.list = list;
}
public BaseIndex(
String parentIndexPath,
int subIndex,
CountDownLatch countDownLatch1,
CountDownLatch countDownLatch2,
List<T> list) {
super();
this.parentIndexPath = parentIndexPath;
this.subIndex = subIndex;
try {
// 多目录索引创建
File file = new File(parentIndexPath + "/index" + subIndex);
if (!file.exists()) {
        file.mkdirs();
}
this.writer = IndexUtil.getIndexWriter(parentIndexPath + "/index" + subIndex, true);
} catch (IOException e) {
e.printStackTrace();
}
this.countDownLatch1 = countDownLatch1;
this.countDownLatch2 = countDownLatch2;
this.list = list;
}
public BaseIndex(
String path, CountDownLatch countDownLatch1, CountDownLatch countDownLatch2, List<T> list) {
super();
try {
// 单目录索引创建
File file = new File(path);
      if (!file.exists()) {
        file.mkdirs();
      }
      this.writer = IndexUtil.getIndexWriter(path, true);
    } catch (IOException e) {
      e.printStackTrace();
    }
this.countDownLatch1 = countDownLatch1;
this.countDownLatch2 = countDownLatch2;
this.list = list;
}
/**
* 创建索引
*
* @param writer
* @throws IOException
* @throws ParseException
*/
public abstract void indexDoc(IndexWriter writer, T t) throws Exception;
/**
* 批量索引创建
*
* @param writer
* @param t
* @throws Exception
*/
public void indexDocs(IndexWriter writer, List<T> t) throws Exception {
for (T t2 : t) {
indexDoc(writer, t2);
}
}
@Override
public void run() {
try {
countDownLatch1.await();
indexDocs(writer, list);
} catch (Exception e) {
e.printStackTrace();
} finally {
countDownLatch2.countDown();
try {
writer.commit();
writer.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
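
The two latches form a start barrier and a completion barrier: run() blocks on countDownLatch1 until the coordinator releases all workers at once, and the coordinator blocks on countDownLatch2 until every worker has committed and closed its writer. A condensed sketch of the protocol (not part of this commit; it assumes non-empty partitions and reuses ArticleBeanIndex from above):

package com.rymcu.forest.lucene.lucene;

import com.rymcu.forest.lucene.model.ArticleLucene;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/** Sketch: the start/finish latch protocol around BaseIndex workers. Not in the commit. */
class IndexLatchDemo {
  static void index(List<List<ArticleLucene>> partitions, String indexPath)
      throws InterruptedException {
    ExecutorService pool = Executors.newFixedThreadPool(partitions.size()); // assumes non-empty
    CountDownLatch start = new CountDownLatch(1);
    CountDownLatch done = new CountDownLatch(partitions.size());
    for (int i = 0; i < partitions.size(); i++) {
      // each worker awaits `start`, indexes its partition, then counts down `done`
      pool.execute(new ArticleBeanIndex(indexPath, i, start, done, partitions.get(i)));
    }
    start.countDown(); // release every worker simultaneously
    done.await();      // wait until all writers are committed and closed
    pool.shutdown();
  }
}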

@@ -0,0 +1,75 @@
/**
* IK 中文分词 版本 5.0.1
* IK Analyzer release 5.0.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package com.rymcu.forest.lucene.lucene;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
/**
 * IK segmenter: implementation of the Lucene Analyzer interface
 */
public final class IKAnalyzer extends Analyzer {
private boolean useSmart;
public boolean useSmart() {
return useSmart;
}
public void setUseSmart(boolean useSmart) {
this.useSmart = useSmart;
}
/**
* IK分词器Lucene Analyzer接口实现类
*
* 默认细粒度切分算法
*/
public IKAnalyzer() {
this(false);
}
/**
* IK分词器Lucene Analyzer接口实现类
*
* @param useSmart 当为true时分词器进行智能切分
*/
public IKAnalyzer(boolean useSmart) {
super();
this.useSmart = useSmart;
}
  /**
   * Lucene 6.0+ API:
   * overrides the Analyzer hook that builds the tokenization components
   */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new IKTokenizer(this.useSmart());
    return new Analyzer.TokenStreamComponents(tokenizer);
}
}
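
A sketch (not part of this commit) of driving the analyzer directly; it assumes the dictionaries have already been initialized via Dictionary.initial (for example by the Spring configuration elsewhere in this commit). Note the mandatory reset()/end()/close() sequence on the TokenStream:

package com.rymcu.forest.lucene.lucene;

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

/** Sketch: tokenizing a string with IKAnalyzer. Not in the commit. */
class IKAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    try (Analyzer analyzer = new IKAnalyzer(true); // true = smart, coarse-grained segmentation
         TokenStream ts = analyzer.tokenStream("title", "这是一个中文分词的例子")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
      ts.reset(); // required before the first incrementToken()
      while (ts.incrementToken()) {
        System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
      }
      ts.end(); // records the final offset
    }
  }
}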

@@ -0,0 +1,121 @@
/**
* IK 中文分词 版本 5.0.1
* IK Analyzer release 5.0.1
* <p>
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* <p>
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*/
package com.rymcu.forest.lucene.lucene;
import com.rymcu.forest.lucene.core.IKSegmenter;
import com.rymcu.forest.lucene.core.Lexeme;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeFactory;
import java.io.IOException;
/**
 * IK segmenter: Lucene Tokenizer adapter,
 * compatible with Lucene 6.0+
 */
public final class IKTokenizer extends Tokenizer {
// IK分词器实现
private IKSegmenter _IKImplement;
// 词元文本属性
private final CharTermAttribute termAtt;
// 词元位移属性
private final OffsetAttribute offsetAtt;
// 词元分类属性该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量
private final TypeAttribute typeAtt;
// 记录最后一个词元的结束位置
private int endPosition;
/**
* Lucene 6.0 Tokenizer适配器类构造函数
* @param useSmart
*/
public IKTokenizer(boolean useSmart) {
super();
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(CharTermAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
    // the Reader passed to IKSegmenter is `input`, which the parent Tokenizer
    // no-arg constructor initializes to ILLEGAL_STATE_READER until reset() is called
_IKImplement = new IKSegmenter(input, useSmart);
}
/**
* lucene 6.0 新增
* 方便创建 工厂类
* @param factory
* @param useSmart
*/
public IKTokenizer(AttributeFactory factory, boolean useSmart) {
super(factory);
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(CharTermAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
_IKImplement = new IKSegmenter(input, useSmart);
}
@Override
public boolean incrementToken() throws IOException {
// 清除所有的词元属性
clearAttributes();
Lexeme nextLexeme = _IKImplement.next();
if (nextLexeme != null) {
// 将Lexeme转成Attributes
// 设置词元文本
termAtt.append(nextLexeme.getLexemeText());
// 设置词元长度
termAtt.setLength(nextLexeme.getLength());
// 设置词元位移
offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
// 记录分词的最后位置
endPosition = nextLexeme.getEndPosition();
// 记录词元分类
typeAtt.setType(nextLexeme.getLexemeTypeString());
      // 返回true告知还有下个词元
return true;
}
    // 返回false告知词元输出完毕
return false;
}
@Override
public void reset() throws IOException {
super.reset();
_IKImplement.reset(input);
}
@Override
public final void end() {
// set final offset
int finalOffset = correctOffset(this.endPosition);
offsetAtt.setOffset(finalOffset, finalOffset);
}
}

@@ -0,0 +1,48 @@
package com.rymcu.forest.lucene.lucene;
import com.rymcu.forest.lucene.model.PortfolioLucene;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import java.util.List;
import java.util.concurrent.CountDownLatch;
/**
* PortfolioBeanIndex
*
* @author suwen
* @date 2021/4/17 14:10
*/
public class PortfolioBeanIndex extends BaseIndex<PortfolioLucene> {
public PortfolioBeanIndex(
String parentIndexPath,
int subIndex,
CountDownLatch countDownLatch1,
CountDownLatch countDownLatch2,
List<PortfolioLucene> list) {
super(parentIndexPath, subIndex, countDownLatch1, countDownLatch2, list);
}
@Override
public void indexDoc(IndexWriter writer, PortfolioLucene user) throws Exception {
Document doc = new Document();
Field id = new Field("id", user.getIdPortfolio() + "", TextField.TYPE_STORED);
Field title = new Field("title", user.getPortfolioTitle(), TextField.TYPE_STORED);
Field summary = new Field("summary", user.getPortfolioDescription(), TextField.TYPE_STORED);
// 添加到Document中
doc.add(id);
doc.add(title);
doc.add(summary);
if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
writer.addDocument(doc);
} else {
writer.updateDocument(new Term("id", user.getIdPortfolio() + ""), doc);
}
}
}

@@ -0,0 +1,48 @@
package com.rymcu.forest.lucene.lucene;
import com.rymcu.forest.lucene.model.UserLucene;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.CountDownLatch;
/**
* UserBeanIndex
*
* @author suwen
* @date 2021/2/2 14:10
*/
public class UserBeanIndex extends BaseIndex<UserLucene> {
public UserBeanIndex(
String parentIndexPath,
int subIndex,
CountDownLatch countDownLatch1,
CountDownLatch countDownLatch2,
List<UserLucene> list) {
super(parentIndexPath, subIndex, countDownLatch1, countDownLatch2, list);
}
@Override
public void indexDoc(IndexWriter writer, UserLucene user) throws Exception {
Document doc = new Document();
Field id = new Field("id", user.getIdUser() + "", TextField.TYPE_STORED);
Field title = new Field("nickname", user.getNickname(), TextField.TYPE_STORED);
Field summary = new Field("signature", Objects.nonNull(user.getSignature()) ? user.getSignature() : "", TextField.TYPE_STORED);
// 添加到Document中
doc.add(id);
doc.add(title);
doc.add(summary);
if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
writer.addDocument(doc);
} else {
writer.updateDocument(new Term("id", user.getIdUser() + ""), doc);
}
}
}

@@ -0,0 +1,43 @@
package com.rymcu.forest.lucene.mapper;
import com.rymcu.forest.dto.ArticleDTO;
import com.rymcu.forest.lucene.model.ArticleLucene;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import java.util.List;
/**
* ArticleLuceneMapper
*
* @author suwen
* @date 2021/2/3 10:00
*/
@Mapper
public interface ArticleLuceneMapper {
/**
* 加载所有文章内容
*
* @return
*/
List<ArticleLucene> getAllArticleLucene();
/**
* 加载所有文章内容
*
* @param ids 文章id(半角逗号分隔)
* @return
*/
List<ArticleDTO> getArticlesByIds(@Param("ids") String[] ids);
/**
* 加载文章内容
*
* @param id 文章id
* @return
*/
ArticleLucene getById(@Param("id") String id);
}

@@ -0,0 +1,44 @@
package com.rymcu.forest.lucene.mapper;
import com.rymcu.forest.dto.PortfolioDTO;
import com.rymcu.forest.lucene.model.PortfolioLucene;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import java.util.List;
/**
* PortfolioLuceneMapper
*
* @author suwen
* @date 2021/4/17 10:00
*/
@Mapper
public interface PortfolioLuceneMapper {
/**
* 加载所有作品集信息
*
* @return
*/
List<PortfolioLucene> getAllPortfolioLucene();
/**
* 加载所有作品集信息
*
* @param ids 作品集id(半角逗号分隔)
* @return
*/
List<PortfolioDTO> getPortfoliosByIds(@Param("ids") String[] ids);
/**
* 加载作品集
*
* @param id 用户id
* @return
*/
PortfolioLucene getById(@Param("id") String id);
}

@@ -0,0 +1,53 @@
package com.rymcu.forest.lucene.mapper;
import com.rymcu.forest.lucene.model.UserDic;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import java.util.List;
/**
* UserDicMapper
*
* @author suwen
* @date 2021/2/4 09:11
*/
@Mapper
public interface UserDicMapper {
/**
* 加载所有字典
*
* @return
*/
List<String> getAllDic();
/**
* 加载所有字典信息
*
* @return
*/
List<UserDic> getAll();
/**
* 增加字典
*
* @return
*/
void addDic(@Param("dic") String userDic);
/**
* 删除字典
*
* @param id
*/
void deleteDic(@Param("id") String id);
/**
* 更新字典
*
* @param id
* @param userDic
*/
void updateDic(@Param("id") Integer id, @Param("dic") String userDic);
}

@@ -0,0 +1,41 @@
package com.rymcu.forest.lucene.mapper;
import com.rymcu.forest.dto.UserDTO;
import com.rymcu.forest.lucene.model.UserLucene;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import java.util.List;
/**
* UserLuceneMapper
*
* @author suwen
* @date 2021/3/6 10:00
*/
@Mapper
public interface UserLuceneMapper {
/**
* 加载所有用户信息
*
* @return
*/
List<UserLucene> getAllUserLucene();
/**
* 加载所有用户信息
*
* @param ids 用户id(半角逗号分隔)
* @return
*/
List<UserDTO> getUsersByIds(@Param("ids") Integer[] ids);
/**
* 加载 UserLucene
*
* @param id 用户id
* @return
*/
UserLucene getById(@Param("id") String id);
}

@@ -0,0 +1,33 @@
package com.rymcu.forest.lucene.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
* ArticleLucene
*
* @author suwen
* @date 2021/2/3 09:57
*/
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class ArticleLucene {
/** 文章编号 */
private String idArticle;
/** 文章标题 */
private String articleTitle;
/** 文章内容 */
private String articleContent;
/** 相关度评分 */
private String score;
}
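
The Lombok annotations generate the fluent builder that the search services below rely on; a one-line sketch (not part of the commit, values are placeholders):

ArticleLucene doc = ArticleLucene.builder()
    .idArticle("42")             // hypothetical id
    .articleTitle("Lucene 集成")
    .articleContent("全文检索示例")
    .build();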

@@ -0,0 +1,31 @@
package com.rymcu.forest.lucene.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
* PortfolioLucene
*
* @author suwen
* @date 2021/3/6 09:57
*/
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class PortfolioLucene {
/** 作品集编号 */
private String idPortfolio;
/** 作品集名称 */
private String portfolioTitle;
/** 作品集介绍 */
private String portfolioDescription;
/** 相关度评分 */
private String score;
}

@@ -0,0 +1,25 @@
package com.rymcu.forest.lucene.model;
import lombok.Data;
import javax.persistence.GeneratedValue;
import javax.persistence.Id;
import javax.persistence.Table;
/**
* UserDic 用户个性化字典
*
* @author suwen
* @date 2021/2/4 09:09
*/
@Data
@Table(name = "forest_lucene_user_dic")
public class UserDic {
/** 主键 */
@Id
@GeneratedValue(generator = "JDBC")
private Integer id;
/** 字典 */
private String dic;
}

@@ -0,0 +1,35 @@
package com.rymcu.forest.lucene.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
* UserLucene
*
* @author suwen
* @date 2021/3/6 09:57
*/
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class UserLucene {
/** 用户编号 */
private Integer idUser;
/** 昵称 */
private String nickname;
/** 签名 */
private String signature;
/** 相关度评分 */
private String score;
}

@@ -0,0 +1,75 @@
package com.rymcu.forest.lucene.service;
import com.rymcu.forest.dto.ArticleDTO;
import com.rymcu.forest.lucene.model.ArticleLucene;
import java.util.List;
/**
* LuceneService
*
* @author suwen
* @date 2021/2/3 10:10
*/
public interface LuceneService {
/**
* 将文章的数据解析为一个个关键字词存储到索引文件中
*
* @param list
*/
void writeArticle(List<ArticleLucene> list);
/**
* 写入单个文章索引
*
* @param id
*/
void writeArticle(String id);
/**
* 写入单个文章索引
*
* @param articleLucene
*/
void writeArticle(ArticleLucene articleLucene);
/**
* 更新单个文章索引
*
* @param id
*/
void updateArticle(String id);
/**
* 删除单个文章索引
*
* @param id
*/
void deleteArticle(String id);
/**
* 关键词搜索
*
* @param value
* @return
* @throws Exception
*/
List<ArticleLucene> searchArticle(String value);
/**
* 加载所有文章内容
*
* @return
*/
List<ArticleLucene> getAllArticleLucene();
/**
* 加载所有文章内容
*
* @param ids 文章id(半角逗号分隔)
* @return
*/
List<ArticleDTO> getArticlesByIds(String[] ids);
}

@@ -0,0 +1,74 @@
package com.rymcu.forest.lucene.service;
import com.rymcu.forest.dto.PortfolioDTO;
import com.rymcu.forest.lucene.model.PortfolioLucene;
import java.util.List;
/**
* PortfolioLuceneService
*
* @author suwen
* @date 2021/4/17 10:10
*/
public interface PortfolioLuceneService {
/**
* 批量写入作品集信息到索引
*
* @param list
*/
void writePortfolio(List<PortfolioLucene> list);
/**
* 写入单个作品集索引
*
* @param id
*/
void writePortfolio(String id);
/**
* 写入单个作品集索引
*
* @param portfolioLucene
*/
void writePortfolio(PortfolioLucene portfolioLucene);
/**
* 更新单个作品集索引
*
* @param id
*/
void updatePortfolio(String id);
/**
* 删除单个作品集索引
*
* @param id
*/
void deletePortfolio(String id);
/**
* 关键词搜索
*
* @param value
* @return
* @throws Exception
*/
List<PortfolioLucene> searchPortfolio(String value);
/**
* 加载所有作品集内容
*
* @return
*/
List<PortfolioLucene> getAllPortfolioLucene();
/**
* 加载所有作品集内容
*
* @param ids 作品集id(半角逗号分隔)
* @return
*/
List<PortfolioDTO> getPortfoliosByIds(String[] ids);
}

@@ -0,0 +1,56 @@
package com.rymcu.forest.lucene.service;
import com.rymcu.forest.lucene.model.UserDic;
import java.io.FileNotFoundException;
import java.util.List;
/**
* UserDicService
*
* @author suwen
* @date 2021/2/4 09:25
*/
public interface UserDicService {
/**
* 加载所有字典
*
* @return
*/
List<String> getAllDic();
/**
* 加载所有字典
*
* @return
*/
List<UserDic> getAll();
/**
* 增加字典
*
* @return
*/
void addDic(String dic);
/**
* 删除字典
*
* @param id
*/
void deleteDic(String id);
/**
* 更新字典
*
* @param userDic
*/
void updateDic(UserDic userDic);
/**
* 写入字典至内存
*
*/
void writeUserDic() throws FileNotFoundException;
}

@@ -0,0 +1,74 @@
package com.rymcu.forest.lucene.service;
import com.rymcu.forest.dto.UserDTO;
import com.rymcu.forest.lucene.model.UserLucene;
import java.util.List;
/**
* UserLuceneService
*
* @author suwen
* @date 2021/3/5 10:10
*/
public interface UserLuceneService {
/**
* 批量写入用户信息到索引
*
* @param list
*/
void writeUser(List<UserLucene> list);
/**
* 写入单个用户索引
*
* @param id
*/
void writeUser(String id);
/**
* 写入单个用户索引
*
   * @param userLucene
   */
  void writeUser(UserLucene userLucene);
/**
* 更新单个用户索引
*
* @param id
*/
void updateUser(String id);
/**
* 删除单个用户索引
*
* @param id
*/
void deleteUser(String id);
/**
* 关键词搜索
*
* @param value
* @return
* @throws Exception
*/
List<UserLucene> searchUser(String value);
/**
* 加载所有用户内容
*
* @return
*/
List<UserLucene> getAllUserLucene();
/**
* 加载所有用户内容
*
* @param ids 用户id(半角逗号分隔)
* @return
*/
List<UserDTO> getUsersByIds(Integer[] ids);
}

@@ -0,0 +1,197 @@
package com.rymcu.forest.lucene.service.impl;
import com.rymcu.forest.dto.ArticleDTO;
import com.rymcu.forest.lucene.lucene.ArticleBeanIndex;
import com.rymcu.forest.lucene.lucene.IKAnalyzer;
import com.rymcu.forest.lucene.mapper.ArticleLuceneMapper;
import com.rymcu.forest.lucene.model.ArticleLucene;
import com.rymcu.forest.lucene.service.LuceneService;
import com.rymcu.forest.lucene.util.ArticleIndexUtil;
import com.rymcu.forest.lucene.util.LucenePath;
import com.rymcu.forest.lucene.util.SearchUtil;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.*;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
* LuceneServiceImpl
*
* @author suwen
* @date 2021/2/3 10:29
*/
@Service
public class LuceneServiceImpl implements LuceneService {
@Resource private ArticleLuceneMapper luceneMapper;
/**
* 将文章的数据解析为一个个关键字词存储到索引文件中
*
* @param list
*/
@Override
public void writeArticle(List<ArticleLucene> list) {
try {
      int totalCount = list.size();
      if (totalCount == 0) {
        // avoid Executors.newFixedThreadPool(0), which throws IllegalArgumentException
        return;
      }
int perThreadCount = 3000;
int threadCount = totalCount / perThreadCount + (totalCount % perThreadCount == 0 ? 0 : 1);
ExecutorService pool = Executors.newFixedThreadPool(threadCount);
CountDownLatch countDownLatch1 = new CountDownLatch(1);
CountDownLatch countDownLatch2 = new CountDownLatch(threadCount);
for (int i = 0; i < threadCount; i++) {
int start = i * perThreadCount;
int end = Math.min((i + 1) * perThreadCount, totalCount);
List<ArticleLucene> subList = list.subList(start, end);
Runnable runnable =
new ArticleBeanIndex(
LucenePath.ARTICLE_INDEX_PATH, i, countDownLatch1, countDownLatch2, subList);
// 子线程交给线程池管理
pool.execute(runnable);
}
countDownLatch1.countDown();
System.out.println("开始创建索引");
// 等待所有线程都完成
countDownLatch2.await();
// 线程全部完成工作
System.out.println("所有线程都创建索引完毕");
// 释放线程池资源
pool.shutdown();
} catch (Exception e) {
e.printStackTrace();
}
}
@Override
public void writeArticle(String id) {
writeArticle(luceneMapper.getById(id));
}
@Override
public void writeArticle(ArticleLucene articleLucene) {
ArticleIndexUtil.addIndex(articleLucene);
}
@Override
public void updateArticle(String id) {
ArticleIndexUtil.updateIndex(luceneMapper.getById(id));
}
@Override
public void deleteArticle(String id) {
ArticleIndexUtil.deleteIndex(id);
}
/**
* 关键词搜索
*
* @param value
* @return
* @throws Exception
*/
@Override
public List<ArticleLucene> searchArticle(String value) {
List<ArticleLucene> resList = new ArrayList<>();
ExecutorService service = Executors.newCachedThreadPool();
// 定义分词器
Analyzer analyzer = new IKAnalyzer();
try {
IndexSearcher searcher =
SearchUtil.getIndexSearcherByParentPath(LucenePath.ARTICLE_INDEX_PATH, service);
String[] fields = {"title", "summary"};
// 构造Query对象
MultiFieldQueryParser parser = new MultiFieldQueryParser(fields, analyzer);
      // when value is null, fall back to reading the query from stdin (local debugging aid)
      BufferedReader in =
          new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
      String line = value != null ? value : in.readLine();
Query query = parser.parse(line);
// 最终被分词后添加的前缀和后缀处理器默认是粗体<B></B>
SimpleHTMLFormatter htmlFormatter =
new SimpleHTMLFormatter("<font color=" + "\"" + "red" + "\"" + ">", "</font>");
// 高亮搜索的词添加到高亮处理器中
Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
// 获取搜索的结果指定返回document返回的个数
      // TODO 默认只取第一页的前 100 条结果,可优化分页
TopDocs results = SearchUtil.getScoreDocsByPerPage(1, 100, searcher, query);
ScoreDoc[] hits = results.scoreDocs;
// 遍历输出
for (ScoreDoc hit : hits) {
int id = hit.doc;
float score = hit.score;
Document hitDoc = searcher.doc(hit.doc);
        // 获取到 summary
        String summary = hitDoc.get("summary");
        // 将查询词与文本匹配,为命中部分添加高亮前缀和后缀
        TokenStream tokenStream =
            TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "summary", analyzer);
        // 第二个参数是待高亮的原始文本,而非查询串
        TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, summary, false, 10);
        StringBuilder summaryValue = new StringBuilder();
        for (TextFragment textFragment : frag) {
          if ((textFragment != null) && (textFragment.getScore() > 0)) {
            // 拼接高亮后的 summary 片段
            summaryValue.append(textFragment.toString());
          }
        }
// 获取到title
String title = hitDoc.get("title");
TokenStream titleTokenStream =
TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "title", analyzer);
TextFragment[] titleFrag =
highlighter.getBestTextFragments(titleTokenStream, title, false, 10);
StringBuilder titleValue = new StringBuilder();
        for (int j = 0; j < titleFrag.length; j++) {
          // check titleFrag (the original mistakenly checked the summary fragments)
          if (titleFrag[j] != null) {
            titleValue.append(titleFrag[j].toString());
          }
        }
resList.add(
ArticleLucene.builder()
.idArticle(hitDoc.get("id"))
.articleTitle(titleValue.toString())
                .articleContent(summaryValue.toString())
.score(String.valueOf(score))
.build());
}
} catch (IOException | ParseException | InvalidTokenOffsetsException e) {
e.printStackTrace();
} finally {
service.shutdownNow();
}
return resList;
}
@Override
public List<ArticleLucene> getAllArticleLucene() {
return luceneMapper.getAllArticleLucene();
}
@Override
public List<ArticleDTO> getArticlesByIds(String[] ids) {
return luceneMapper.getArticlesByIds(ids);
}
}

@@ -0,0 +1,190 @@
package com.rymcu.forest.lucene.service.impl;
import com.rymcu.forest.dto.PortfolioDTO;
import com.rymcu.forest.lucene.lucene.IKAnalyzer;
import com.rymcu.forest.lucene.lucene.PortfolioBeanIndex;
import com.rymcu.forest.lucene.mapper.PortfolioLuceneMapper;
import com.rymcu.forest.lucene.model.PortfolioLucene;
import com.rymcu.forest.lucene.service.PortfolioLuceneService;
import com.rymcu.forest.lucene.util.LucenePath;
import com.rymcu.forest.lucene.util.PortfolioIndexUtil;
import com.rymcu.forest.lucene.util.SearchUtil;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.*;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
 * PortfolioLuceneServiceImpl
*
* @author suwen
* @date 2021/3/6 10:29
*/
@Service
public class PortfolioLuceneServiceImpl implements PortfolioLuceneService {
@Resource private PortfolioLuceneMapper portfolioLuceneMapper;
/**
   * 将作品集的数据解析为一个个关键字词存储到索引文件中
*
* @param list
*/
@Override
public void writePortfolio(List<PortfolioLucene> list) {
try {
      int totalCount = list.size();
      if (totalCount == 0) {
        // avoid Executors.newFixedThreadPool(0), which throws IllegalArgumentException
        return;
      }
int perThreadCount = 3000;
int threadCount = totalCount / perThreadCount + (totalCount % perThreadCount == 0 ? 0 : 1);
ExecutorService pool = Executors.newFixedThreadPool(threadCount);
CountDownLatch countDownLatch1 = new CountDownLatch(1);
CountDownLatch countDownLatch2 = new CountDownLatch(threadCount);
for (int i = 0; i < threadCount; i++) {
int start = i * perThreadCount;
int end = Math.min((i + 1) * perThreadCount, totalCount);
List<PortfolioLucene> subList = list.subList(start, end);
Runnable runnable =
new PortfolioBeanIndex(LucenePath.PORTFOLIO_PATH, i, countDownLatch1, countDownLatch2, subList);
// 子线程交给线程池管理
pool.execute(runnable);
}
countDownLatch1.countDown();
System.out.println("开始创建索引");
// 等待所有线程都完成
countDownLatch2.await();
// 线程全部完成工作
System.out.println("所有线程都创建索引完毕");
// 释放线程池资源
pool.shutdown();
} catch (Exception e) {
e.printStackTrace();
}
}
@Override
public void writePortfolio(String id) {
writePortfolio(portfolioLuceneMapper.getById(id));
}
@Override
public void writePortfolio(PortfolioLucene portfolioLucene) {
PortfolioIndexUtil.addIndex(portfolioLucene);
}
@Override
public void updatePortfolio(String id) {
PortfolioIndexUtil.updateIndex(portfolioLuceneMapper.getById(id));
}
@Override
public void deletePortfolio(String id) {
PortfolioIndexUtil.deleteIndex(id);
}
@Override
public List<PortfolioLucene> getAllPortfolioLucene() {
return portfolioLuceneMapper.getAllPortfolioLucene();
}
@Override
public List<PortfolioDTO> getPortfoliosByIds(String[] ids) {
return portfolioLuceneMapper.getPortfoliosByIds(ids);
}
@Override
public List<PortfolioLucene> searchPortfolio(String value) {
List<PortfolioLucene> resList = new ArrayList<>();
ExecutorService service = Executors.newCachedThreadPool();
    // Define the analyzer (IK Chinese tokenizer)
    Analyzer analyzer = new IKAnalyzer();
    try {
      IndexSearcher searcher = SearchUtil.getIndexSearcherByParentPath(LucenePath.PORTFOLIO_PATH, service);
      String[] fields = {"title", "summary"};
      // Build the query across both fields
      MultiFieldQueryParser parser = new MultiFieldQueryParser(fields, analyzer);
      // Fall back to reading from stdin only when no search value was supplied
      BufferedReader in =
          new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
      String line = value != null ? value : in.readLine();
Query query = parser.parse(line);
      // Prefix/suffix wrapped around matched terms; the formatter default is bold <B></B>
      SimpleHTMLFormatter htmlFormatter =
          new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
      // Register the query terms with the highlighter
      Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
      // Fetch search results, limiting how many documents come back
      // TODO defaults to the first page of 100 results; could be made configurable
TopDocs results = SearchUtil.getScoreDocsByPerPage(1, 100, searcher, query);
ScoreDoc[] hits = results.scoreDocs;
      // Iterate over the hits
      for (ScoreDoc hit : hits) {
        int id = hit.doc;
        float score = hit.score;
        Document hitDoc = searcher.doc(hit.doc);
        // Get the summary
        String summary = hitDoc.get("summary");
        // Wrap matched query terms in the highlight prefix/suffix
        TokenStream tokenStream =
            TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "summary", analyzer);
        // The second argument is the stored field text to highlight
        TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, summary, false, 10);
        StringBuilder sb = new StringBuilder();
        for (TextFragment textFragment : frag) {
          if ((textFragment != null) && (textFragment.getScore() > 0)) {
            // Collect the highlighted summary fragments
            sb.append(textFragment.toString());
          }
        }
        // Get the title
        String title = hitDoc.get("title");
        TokenStream titleTokenStream =
            TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "title", analyzer);
        TextFragment[] titleFrag =
            highlighter.getBestTextFragments(titleTokenStream, title, false, 10);
        StringBuilder titleValue = new StringBuilder();
        for (int j = 0; j < titleFrag.length; j++) {
          if (titleFrag[j] != null) {
            titleValue.append(titleFrag[j].toString());
          }
        }
resList.add(
PortfolioLucene.builder()
.idPortfolio(hitDoc.get("id"))
.portfolioTitle(titleValue.toString())
.portfolioDescription(sb.toString())
.score(String.valueOf(score))
.build());
}
    } catch (IOException | ParseException | InvalidTokenOffsetsException e) {
      e.printStackTrace();
} finally {
service.shutdownNow();
}
return resList;
}
}
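
writePortfolio (and its writeUser counterpart below) coordinates batch workers with two latches: countDownLatch1 is a start gate that holds every *BeanIndex worker until all batches are submitted, and countDownLatch2 is a completion latch the submitting thread awaits. A self-contained sketch of the pattern, assuming the workers await and count down the latches this way (the real *BeanIndex internals are not shown in this commit section):

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

// Start-gate + completion-latch pattern used by the batched index build.
public class TwoLatchExample {
  public static void main(String[] args) throws InterruptedException {
    int workers = 3;
    CountDownLatch startGate = new CountDownLatch(1);
    CountDownLatch doneLatch = new CountDownLatch(workers);
    ExecutorService pool = Executors.newFixedThreadPool(workers);
    for (int i = 0; i < workers; i++) {
      int batch = i;
      pool.execute(() -> {
        try {
          startGate.await();              // block until all batches are submitted
          System.out.println("indexing batch " + batch);
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
        } finally {
          doneLatch.countDown();          // signal this batch is finished
        }
      });
    }
    startGate.countDown();                // release every worker at once
    doneLatch.await();                    // wait for all batches to complete
    pool.shutdown();
  }
}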

View File

@ -0,0 +1,79 @@
package com.rymcu.forest.lucene.service.impl;
import com.rymcu.forest.lucene.dic.Dictionary;
import com.rymcu.forest.lucene.mapper.UserDicMapper;
import com.rymcu.forest.lucene.model.UserDic;
import com.rymcu.forest.lucene.service.UserDicService;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.List;
/**
* UserDicServiceImpl
*
* @author suwen
* @date 2021/2/4 09:26
*/
@Service
public class UserDicServiceImpl implements UserDicService {
@Resource private UserDicMapper userDicMapper;
@Override
public List<String> getAllDic() {
return userDicMapper.getAllDic();
}
@Override
public List<UserDic> getAll() {
return userDicMapper.getAll();
}
@Override
public void addDic(String dic) {
userDicMapper.addDic(dic);
writeUserDic();
}
@Override
public void deleteDic(String id) {
userDicMapper.deleteDic(id);
writeUserDic();
}
@Override
public void updateDic(UserDic userDic) {
userDicMapper.updateDic(userDic.getId(), userDic.getDic());
writeUserDic();
}
@Override
  public void writeUserDic() {
    String filePath = "lucene/userDic/";
    File file = new File(filePath);
    if (!file.exists()) {
      file.mkdirs();
    }
    // Rewrite the dictionary file, then ask the IK dictionary singleton to reload it
    try (PrintWriter fw =
        new PrintWriter(
            new BufferedWriter(
                new OutputStreamWriter(
                    new FileOutputStream(new File(file, "userDic.dic"), false),
                    StandardCharsets.UTF_8)))) {
      userDicMapper
          .getAllDic()
          .forEach(
              each -> {
                fw.write(each);
                fw.write("\r\n");
              });
      fw.flush();
      Dictionary.getSingleton().updateUserDict();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
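
Every mutation above funnels through writeUserDic, which rewrites lucene/userDic/userDic.dic and hot-reloads the IK dictionary singleton. A brief illustrative call sequence (the term is made up; UserDicExample is not part of the commit):

// Illustrative use of the user dictionary service defined above.
public class UserDicExample {
  public static void run(UserDicService userDicService) {
    userDicService.addDic("全文搜索");   // persist a new term, rewrite the file, reload IK
    userDicService.getAllDic().forEach(System.out::println);
  }
}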

View File

@ -0,0 +1,189 @@
package com.rymcu.forest.lucene.service.impl;
import com.rymcu.forest.dto.UserDTO;
import com.rymcu.forest.lucene.lucene.UserBeanIndex;
import com.rymcu.forest.lucene.lucene.IKAnalyzer;
import com.rymcu.forest.lucene.mapper.UserLuceneMapper;
import com.rymcu.forest.lucene.model.UserLucene;
import com.rymcu.forest.lucene.service.UserLuceneService;
import com.rymcu.forest.lucene.util.LucenePath;
import com.rymcu.forest.lucene.util.UserIndexUtil;
import com.rymcu.forest.lucene.util.SearchUtil;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.*;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
 * UserLuceneServiceImpl
*
* @author suwen
* @date 2021/3/6 10:29
*/
@Service
public class UserLuceneServiceImpl implements UserLuceneService {
@Resource private UserLuceneMapper userLuceneMapper;
/**
   * Tokenizes user data into keywords and writes them to the index files
   *
   * @param list users to index
*/
@Override
public void writeUser(List<UserLucene> list) {
    try {
      int totalCount = list.size();
      if (totalCount == 0) {
        // Nothing to index; Executors.newFixedThreadPool(0) would throw
        return;
      }
      int perThreadCount = 3000;
      int threadCount = totalCount / perThreadCount + (totalCount % perThreadCount == 0 ? 0 : 1);
      ExecutorService pool = Executors.newFixedThreadPool(threadCount);
      CountDownLatch countDownLatch1 = new CountDownLatch(1);
      CountDownLatch countDownLatch2 = new CountDownLatch(threadCount);
      for (int i = 0; i < threadCount; i++) {
        int start = i * perThreadCount;
        int end = Math.min((i + 1) * perThreadCount, totalCount);
        List<UserLucene> subList = list.subList(start, end);
        Runnable runnable =
            new UserBeanIndex(LucenePath.USER_PATH, i, countDownLatch1, countDownLatch2, subList);
        // Hand each batch to the thread pool
        pool.execute(runnable);
      }
      // Release all workers at once, then wait for every batch to finish
      countDownLatch1.countDown();
      System.out.println("开始创建索引");
      countDownLatch2.await();
      System.out.println("所有线程都创建索引完毕");
      // Release pool resources
      pool.shutdown();
    } catch (Exception e) {
      e.printStackTrace();
    }
}
@Override
public void writeUser(String id) {
writeUser(userLuceneMapper.getById(id));
}
@Override
  public void writeUser(UserLucene userLucene) {
    UserIndexUtil.addIndex(userLucene);
  }
@Override
public void updateUser(String id) {
UserIndexUtil.updateIndex(userLuceneMapper.getById(id));
}
@Override
public void deleteUser(String id) {
UserIndexUtil.deleteIndex(id);
}
@Override
public List<UserLucene> searchUser(String value) {
List<UserLucene> resList = new ArrayList<>();
ExecutorService service = Executors.newCachedThreadPool();
    // Define the analyzer (IK Chinese tokenizer)
    Analyzer analyzer = new IKAnalyzer();
    try {
      IndexSearcher searcher = SearchUtil.getIndexSearcherByParentPath(LucenePath.USER_PATH, service);
      String[] fields = {"nickname", "signature"};
      // Build the query across both fields
      MultiFieldQueryParser parser = new MultiFieldQueryParser(fields, analyzer);
      // Fall back to reading from stdin only when no search value was supplied
      BufferedReader in =
          new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
      String line = value != null ? value : in.readLine();
Query query = parser.parse(line);
      // Prefix/suffix wrapped around matched terms; the formatter default is bold <B></B>
      SimpleHTMLFormatter htmlFormatter =
          new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
      // Register the query terms with the highlighter
      Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
      // Fetch search results, limiting how many documents come back
      // TODO defaults to the first page of 100 results; could be made configurable
TopDocs results = SearchUtil.getScoreDocsByPerPage(1, 100, searcher, query);
ScoreDoc[] hits = results.scoreDocs;
      // Iterate over the hits
      for (ScoreDoc hit : hits) {
        int id = hit.doc;
        float score = hit.score;
        Document hitDoc = searcher.doc(hit.doc);
        // Get the signature
        String signature = hitDoc.get("signature");
        // Wrap matched query terms in the highlight prefix/suffix
        TokenStream tokenStream =
            TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "signature", analyzer);
        // The second argument is the stored field text to highlight
        TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, signature, false, 10);
        StringBuilder signatureValue = new StringBuilder();
        for (TextFragment textFragment : frag) {
          if ((textFragment != null) && (textFragment.getScore() > 0)) {
            // Collect the highlighted signature fragments
            signatureValue.append(textFragment.toString());
          }
        }
        // Get the nickname
        String nickname = hitDoc.get("nickname");
        TokenStream nicknameTokenStream =
            TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "nickname", analyzer);
        TextFragment[] nicknameFrag =
            highlighter.getBestTextFragments(nicknameTokenStream, nickname, false, 10);
        StringBuilder nicknameValue = new StringBuilder();
        for (int j = 0; j < nicknameFrag.length; j++) {
          if (nicknameFrag[j] != null) {
            nicknameValue.append(nicknameFrag[j].toString());
          }
        }
        resList.add(
            UserLucene.builder()
                .idUser(Integer.valueOf(hitDoc.get("id")))
                .nickname(nicknameValue.toString())
                .signature(signatureValue.toString())
                .score(String.valueOf(score))
                .build());
}
    } catch (IOException | ParseException | InvalidTokenOffsetsException e) {
      e.printStackTrace();
} finally {
service.shutdownNow();
}
return resList;
}
@Override
public List<UserLucene> getAllUserLucene() {
return userLuceneMapper.getAllUserLucene();
}
@Override
public List<UserDTO> getUsersByIds(Integer[] ids) {
return userLuceneMapper.getUsersByIds(ids);
}
}

View File

@ -0,0 +1,84 @@
package com.rymcu.forest.lucene.util;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.StrUtil;
import com.rymcu.forest.lucene.model.ArticleLucene;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import java.io.IOException;
import java.util.Arrays;
/**
* 文章索引更新工具类
*
* @author suwen
*/
public class ArticleIndexUtil {
  /** Directory where the Lucene article indexes are stored */
  private static final String PATH =
      System.getProperty("user.dir") + StrUtil.SLASH + LucenePath.ARTICLE_INDEX_PATH;
  /** Delete all indexes saved at runtime */
public static void deleteAllIndex() {
if (FileUtil.exist(LucenePath.ARTICLE_INCREMENT_INDEX_PATH)) {
FileUtil.del(LucenePath.ARTICLE_INCREMENT_INDEX_PATH);
}
}
  public static void addIndex(ArticleLucene t) {
    createIndex(t);
  }
  public static void updateIndex(ArticleLucene t) {
    deleteIndex(t.getIdArticle());
    createIndex(t);
  }
  /**
   * Add or create a single index entry
   *
   * @param t article to index
   */
  private static synchronized void createIndex(ArticleLucene t) {
    System.out.println("创建单个索引");
    IndexWriter writer;
    try {
      writer = IndexUtil.getIndexWriter(LucenePath.ARTICLE_INCREMENT_INDEX_PATH, false);
Document doc = new Document();
doc.add(new StringField("id", t.getIdArticle() + "", Field.Store.YES));
doc.add(new TextField("title", t.getArticleTitle(), Field.Store.YES));
doc.add(new TextField("summary", t.getArticleContent(), Field.Store.YES));
writer.addDocument(doc);
writer.close();
} catch (IOException e) {
e.printStackTrace();
}
}
  /** Delete a single index entry by id */
public static synchronized void deleteIndex(String id) {
Arrays.stream(FileUtil.ls(PATH))
.forEach(
each -> {
if (each.isDirectory()) {
IndexWriter writer;
try {
writer = IndexUtil.getIndexWriter(each.getAbsolutePath(), false);
writer.deleteDocuments(new Term("id", id));
                    writer.forceMergeDeletes(); // force deleted documents to be purged
writer.commit();
writer.close();
} catch (IOException e) {
e.printStackTrace();
}
}
});
}
}
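
The three *IndexUtil classes share one lifecycle against the incremental index: addIndex writes a new document, updateIndex deletes by id across every index directory and re-adds, and deleteIndex purges by id everywhere. A short illustrative run (the id and text are made up):

// Illustrative lifecycle of the incremental article index (values are invented).
public class ArticleIndexExample {
  public static void main(String[] args) {
    ArticleLucene article =
        ArticleLucene.builder()
            .idArticle("42")
            .articleTitle("Lucene 集成")
            .articleContent("Integrating Lucene full-text search into forest")
            .build();
    ArticleIndexUtil.addIndex(article);      // write the document into the increment directory
    ArticleIndexUtil.updateIndex(article);   // delete by id across index dirs, then re-add
    ArticleIndexUtil.deleteIndex("42");      // remove it from every index directory
  }
}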

View File

@ -0,0 +1,50 @@
package com.rymcu.forest.lucene.util;
import com.rymcu.forest.lucene.lucene.IKAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import java.io.IOException;
import java.nio.file.Paths;
/**
* IndexUtil
*
* @author suwen
* @date 2021/2/2 14:03
*/
public class IndexUtil {
  /**
   * Create an index writer
   *
   * @param indexPath index directory
   * @param create whether to create a fresh index (true) or append (false)
   * @return configured IndexWriter
   * @throws IOException on directory access failure
   */
  public static IndexWriter getIndexWriter(String indexPath, boolean create) throws IOException {
    Directory dir = FSDirectory.open(Paths.get(indexPath));
    Analyzer analyzer = new IKAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    LogMergePolicy mergePolicy = new LogByteSizeMergePolicy();
    // Merge frequency as documents are added to a segment:
    // smaller values slow indexing down, larger values speed it up;
    // > 10 suits batch index builds
    mergePolicy.setMergeFactor(50);
    // Maximum number of documents merged into one segment:
    // smaller values favor incremental indexing speed,
    // larger values suit batch builds and faster searches
    mergePolicy.setMaxMergeDocs(5000);
    // Attach the merge policy to the writer config
    iwc.setMergePolicy(mergePolicy);
    if (create) {
      iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    } else {
      iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    }
    return new IndexWriter(dir, iwc);
  }
}
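
A brief sketch of how the writer factory is used: create=true rebuilds the directory from scratch, create=false appends, and closing the writer commits. The path and field values below are illustrative:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import java.io.IOException;

// Illustrative use of IndexUtil: full rebuild, then an incremental append.
public class IndexUtilExample {
  public static void main(String[] args) throws IOException {
    // Rebuild from scratch (OpenMode.CREATE wipes any existing segments)
    try (IndexWriter writer = IndexUtil.getIndexWriter("lucene/index/article/demo", true)) {
      Document doc = new Document();
      doc.add(new StringField("id", "1", Field.Store.YES));           // exact-match key, not tokenized
      doc.add(new TextField("title", "Lucene 集成", Field.Store.YES)); // analyzed full-text field
      writer.addDocument(doc);
    }
    // Later writes append to the same directory (CREATE_OR_APPEND)
    try (IndexWriter writer = IndexUtil.getIndexWriter("lucene/index/article/demo", false)) {
      Document doc = new Document();
      doc.add(new StringField("id", "2", Field.Store.YES));
      doc.add(new TextField("title", "全文搜索", Field.Store.YES));
      writer.addDocument(doc);
    }
  }
}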

View File

@ -0,0 +1,33 @@
package com.rymcu.forest.lucene.util;
/**
 * LucenePath constants for Lucene index locations
*
* @author Suwen
*/
public final class LucenePath {
  /** Root Lucene index directory */
  public static final String INDEX_PATH = "lucene/index";
  /** Article index directory */
  public static final String ARTICLE_INDEX_PATH = INDEX_PATH + "/article";
  /** Incremental article index directory */
  public static final String ARTICLE_INCREMENT_INDEX_PATH =
      System.getProperty("user.dir") + "/" + ARTICLE_INDEX_PATH + "/index777";
  /** User index directory */
  public static final String USER_PATH = INDEX_PATH + "/user";
  /** Incremental user index directory */
  public static final String USER_INCREMENT_INDEX_PATH =
      System.getProperty("user.dir") + "/" + USER_PATH + "/index777";
  /** Portfolio index directory */
  public static final String PORTFOLIO_PATH = INDEX_PATH + "/portfolio";
  /** Incremental portfolio index directory */
  public static final String PORTFOLIO_INCREMENT_INDEX_PATH =
      System.getProperty("user.dir") + "/" + PORTFOLIO_PATH + "/index777";
}

View File

@ -0,0 +1,85 @@
package com.rymcu.forest.lucene.util;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.StrUtil;
import com.rymcu.forest.lucene.model.PortfolioLucene;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import java.io.IOException;
import java.util.Arrays;
/**
* 作品集索引更新工具类
*
* @author suwen
*/
public class PortfolioIndexUtil {
  /** Directory where the Lucene portfolio indexes are stored */
  private static final String PATH =
      System.getProperty("user.dir") + StrUtil.SLASH + LucenePath.PORTFOLIO_PATH;
  /** Delete all indexes saved at runtime */
public static void deleteAllIndex() {
if (FileUtil.exist(LucenePath.PORTFOLIO_INCREMENT_INDEX_PATH)) {
FileUtil.del(LucenePath.PORTFOLIO_INCREMENT_INDEX_PATH);
}
}
  public static void addIndex(PortfolioLucene t) {
    createIndex(t);
  }
  public static void updateIndex(PortfolioLucene t) {
    deleteIndex(t.getIdPortfolio());
    createIndex(t);
  }
  /**
   * Add or create a single index entry
   *
   * @param t portfolio to index
   */
  private static synchronized void createIndex(PortfolioLucene t) {
    System.out.println("创建单个索引");
    IndexWriter writer;
    try {
      writer = IndexUtil.getIndexWriter(LucenePath.PORTFOLIO_INCREMENT_INDEX_PATH, false);
Document doc = new Document();
doc.add(new StringField("id", t.getIdPortfolio() + "", Field.Store.YES));
doc.add(new TextField("title", t.getPortfolioTitle(), Field.Store.YES));
doc.add(new TextField("summary", t.getPortfolioDescription(), Field.Store.YES));
writer.addDocument(doc);
writer.close();
} catch (IOException e) {
e.printStackTrace();
}
}
  /** Delete a single index entry by id */
public static synchronized void deleteIndex(String id) {
Arrays.stream(FileUtil.ls(PATH))
.forEach(
each -> {
if (each.isDirectory()) {
IndexWriter writer;
try {
writer = IndexUtil.getIndexWriter(each.getAbsolutePath(), false);
writer.deleteDocuments(new Term("id", id));
                    writer.forceMergeDeletes(); // force deleted documents to be purged
writer.commit();
writer.close();
} catch (IOException e) {
e.printStackTrace();
}
}
});
}
}

View File

@ -0,0 +1,157 @@
package com.rymcu.forest.lucene.util;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.Set;
import java.util.concurrent.ExecutorService;
/**
 * SearchUtil utility class for querying Lucene indexes
*
* @author suwen
* @date 2021/2/2 14:04
*/
public class SearchUtil {
  /**
   * Get an IndexSearcher spanning every index directory under a parent path
   *
   * @param parentPath parent directory holding one sub-directory per index
   * @param service executor used for parallel segment searches
   * @return searcher over a MultiReader of all sub-indexes
   */
  public static IndexSearcher getIndexSearcherByParentPath(
      String parentPath, ExecutorService service) {
    MultiReader reader = null;
    try {
      // Open a reader per sub-directory and merge them into one view
      File[] files = new File(parentPath).listFiles();
      IndexReader[] readers = new IndexReader[files.length];
      for (int i = 0; i < files.length; i++) {
        readers[i] =
            DirectoryReader.open(FSDirectory.open(Paths.get(files[i].getPath())));
      }
      reader = new MultiReader(readers);
    } catch (IOException e) {
      e.printStackTrace();
    }
    return new IndexSearcher(reader, service);
  }
/**
   * Get a DirectoryReader for the given index path
   *
   * @param indexPath index directory
   * @return open reader
   * @throws IOException on directory access failure
*/
public static DirectoryReader getIndexReader(String indexPath) throws IOException {
return DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
}
/**
   * Get an IndexSearcher for the given index path
   *
   * @param indexPath index directory
   * @param service executor used for parallel segment searches
   * @return searcher over the index
   * @throws IOException on directory access failure
*/
public static IndexSearcher getIndexSearcherByIndexPath(String indexPath, ExecutorService service)
throws IOException {
IndexReader reader = getIndexReader(indexPath);
return new IndexSearcher(reader, service);
}
  /**
   * Use this to refresh an IndexSearcher when the index directory may have changed;
   * reopening this way consumes fewer resources than opening from scratch
   *
   * @param oldSearcher existing searcher to refresh
   * @param service executor used for parallel segment searches
   * @return a new searcher, or the old one if the index is unchanged
   * @throws IOException on reopen failure
   */
  public static IndexSearcher getIndexSearcherOpenIfChanged(
      IndexSearcher oldSearcher, ExecutorService service) throws IOException {
    DirectoryReader reader = (DirectoryReader) oldSearcher.getIndexReader();
    DirectoryReader newReader = DirectoryReader.openIfChanged(reader);
    if (newReader == null) {
      // openIfChanged returns null when nothing changed; keep the old searcher
      return oldSearcher;
    }
    return new IndexSearcher(newReader, service);
  }
/**
   * Get the full stored document for a docID
   *
   * @param searcher searcher to read from
   * @param docID internal Lucene document id
   * @return stored document
   * @throws IOException on read failure
*/
public static Document getDefaultFullDocument(IndexSearcher searcher, int docID)
throws IOException {
return searcher.doc(docID);
}
/**
   * Get a document loading only the listed fields
   *
   * @param searcher searcher to read from
   * @param docID internal Lucene document id
   * @param listField names of the fields to load
   * @return stored document restricted to those fields
   * @throws IOException on read failure
*/
public static Document getDocumentByListField(
IndexSearcher searcher, int docID, Set<String> listField) throws IOException {
return searcher.doc(docID, listField);
}
/**
   * Paged query
   *
   * @param page current page number (1-based)
   * @param perPage results per page
   * @param searcher searcher to query
   * @param query query condition
   * @return one page of results
   * @throws IOException on search failure
*/
public static TopDocs getScoreDocsByPerPage(
int page, int perPage, IndexSearcher searcher, Query query) throws IOException {
TopDocs result = null;
if (query == null) {
System.out.println(" Query is null return null ");
return null;
}
ScoreDoc before = null;
if (page != 1) {
TopDocs docsBefore = searcher.search(query, (page - 1) * perPage);
ScoreDoc[] scoreDocs = docsBefore.scoreDocs;
if (scoreDocs.length > 0) {
before = scoreDocs[scoreDocs.length - 1];
}
}
result = searcher.searchAfter(before, query, perPage);
return result;
}
  public static TopDocs getScoreDocs(IndexSearcher searcher, Query query) throws IOException {
    return searcher.search(query, getMaxDocId(searcher));
  }
/**
   * Count of documents in the index, equivalent to a MatchAllDocsQuery
   *
   * @param searcher searcher to inspect
   * @return maximum document count
*/
public static int getMaxDocId(IndexSearcher searcher) {
return searcher.getIndexReader().maxDoc();
}
}
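
A hedged sketch combining the MultiReader searcher with the searchAfter pagination above; the index path, query text, and StandardAnalyzer are stand-ins for the project's real values (the services use IKAnalyzer):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

// Illustrative paging over an index built by this commit.
public class SearchUtilExample {
  public static void main(String[] args) throws Exception {
    ExecutorService service = Executors.newCachedThreadPool();
    try {
      IndexSearcher searcher =
          SearchUtil.getIndexSearcherByParentPath("lucene/index/article", service);
      Query query = new QueryParser("title", new StandardAnalyzer()).parse("lucene");
      // Page 2, 10 hits per page: page 1 is re-searched internally, then searchAfter continues
      TopDocs page2 = SearchUtil.getScoreDocsByPerPage(2, 10, searcher, query);
      for (ScoreDoc hit : page2.scoreDocs) {
        System.out.println(searcher.doc(hit.doc).get("title") + " score=" + hit.score);
      }
    } finally {
      service.shutdownNow();
    }
  }
}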

View File

@ -0,0 +1,87 @@
package com.rymcu.forest.lucene.util;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.StrUtil;
import com.rymcu.forest.lucene.model.UserLucene;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import java.io.IOException;
import java.util.Arrays;
/**
* 用户索引更新工具类
*
* @author suwen
*/
public class UserIndexUtil {
  /** Directory where the Lucene user indexes are stored */
  private static final String PATH = System.getProperty("user.dir") + StrUtil.SLASH + LucenePath.USER_PATH;
  /** Incremental index directory used at runtime */
  private static final String INDEX_PATH = LucenePath.USER_INCREMENT_INDEX_PATH;
  /** Delete all indexes saved at runtime */
public static void deleteAllIndex() {
if (FileUtil.exist(INDEX_PATH)) {
FileUtil.del(INDEX_PATH);
}
}
  public static void addIndex(UserLucene t) {
    createIndex(t);
  }
  public static void updateIndex(UserLucene t) {
    deleteIndex(t.getIdUser().toString());
    createIndex(t);
  }
  /**
   * Add or create a single index entry
   *
   * @param t user to index
   */
  private static synchronized void createIndex(UserLucene t) {
    System.out.println("创建单个索引");
    IndexWriter writer;
    try {
      writer = IndexUtil.getIndexWriter(INDEX_PATH, false);
Document doc = new Document();
doc.add(new StringField("id", t.getIdUser() + "", Field.Store.YES));
doc.add(new TextField("nickname", t.getNickname(), Field.Store.YES));
doc.add(new TextField("signature", t.getSignature(), Field.Store.YES));
writer.addDocument(doc);
writer.close();
} catch (IOException e) {
e.printStackTrace();
}
}
  /** Delete a single index entry by id */
public static synchronized void deleteIndex(String id) {
Arrays.stream(FileUtil.ls(PATH))
.forEach(
each -> {
if (each.isDirectory()) {
IndexWriter writer;
try {
writer = IndexUtil.getIndexWriter(each.getAbsolutePath(), false);
writer.deleteDocuments(new Term("id", id));
                    writer.forceMergeDeletes(); // force deleted documents to be purged
writer.commit();
writer.close();
} catch (IOException e) {
e.printStackTrace();
}
}
});
}
}

View File

@ -1,28 +0,0 @@
package com.rymcu.forest.mapper;
import com.rymcu.forest.dto.SearchModel;
import java.util.List;
/**
* @author ronger
*/
public interface SearchMapper {
  /**
   * Initialize article search data
   * @return article search models
   */
  List<SearchModel> searchInitialArticleSearch();
  /**
   * Initialize portfolio search data
   * @return portfolio search models
   */
  List<SearchModel> searchInitialPortfolioSearch();
  /**
   * Initialize user search data
   * @return user search models
   */
  List<SearchModel> searchInitialUserSearch();
}

View File

@ -10,8 +10,8 @@ import org.apache.ibatis.annotations.Param;
public interface UserExtendMapper extends Mapper<UserExtend> {
/**
 * Get user extended info
* @param nickname
* @param account
* @return
*/
UserExtend selectUserExtendByNickname(@Param("nickname") String nickname);
UserExtend selectUserExtendByAccount(@Param("account") String account);
}

View File

@ -1,16 +0,0 @@
package com.rymcu.forest.service;
import com.rymcu.forest.dto.SearchModel;
import java.util.List;
/**
* @author ronger
*/
public interface SearchService {
/**
 * Initialize search data
* @return
*/
List<SearchModel> initialSearch();
}

View File

@ -8,6 +8,7 @@ import com.rymcu.forest.entity.Article;
import com.rymcu.forest.entity.ArticleContent;
import com.rymcu.forest.entity.Tag;
import com.rymcu.forest.entity.User;
import com.rymcu.forest.lucene.service.LuceneService;
import com.rymcu.forest.mapper.ArticleMapper;
import com.rymcu.forest.service.ArticleService;
import com.rymcu.forest.service.CommentService;
@ -15,6 +16,8 @@ import com.rymcu.forest.service.TagService;
import com.rymcu.forest.service.UserService;
import com.rymcu.forest.util.*;
import com.rymcu.forest.web.api.exception.BaseApiException;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.text.StringEscapeUtils;
import org.springframework.beans.factory.annotation.Value;
@ -31,6 +34,7 @@ import java.util.*;
* @author ronger
*/
@Service
@Slf4j
public class ArticleServiceImpl extends AbstractService<Article> implements ArticleService {
@Resource
@ -41,6 +45,8 @@ public class ArticleServiceImpl extends AbstractService<Article> implements Arti
private UserService userService;
@Resource
private CommentService commentService;
@Resource
private LuceneService luceneService;
@Value("${resource.domain}")
private String domain;
@ -175,7 +181,17 @@ public class ArticleServiceImpl extends AbstractService<Article> implements Arti
}
}
}
// Drafts are not indexed; only published articles (status 0) touch the index
if ("0".equals(article.getArticleStatus())) {
if (isUpdate) {
log.info("更新文章索引id={}", newArticle.getIdArticle());
luceneService.updateArticle(newArticle.getIdArticle().toString());
} else {
log.info("写入文章索引id={}", newArticle.getIdArticle());
luceneService.writeArticle(newArticle.getIdArticle().toString());
}
}
tagService.saveTagArticle(newArticle, articleContentHtml);
if (defaultStatus.equals(newArticle.getArticleStatus())) {
@ -271,6 +287,7 @@ public class ArticleServiceImpl extends AbstractService<Article> implements Arti
deleteLinkedData(id);
// 删除文章
result = articleMapper.deleteByPrimaryKey(id);
luceneService.deleteArticle(id.toString());
if (result < 1) {
map.put("message", "删除失败!");
}

View File

@ -7,6 +7,8 @@ import com.rymcu.forest.dto.*;
import com.rymcu.forest.entity.Article;
import com.rymcu.forest.entity.Portfolio;
import com.rymcu.forest.entity.User;
import com.rymcu.forest.lucene.model.PortfolioLucene;
import com.rymcu.forest.lucene.util.PortfolioIndexUtil;
import com.rymcu.forest.mapper.PortfolioMapper;
import com.rymcu.forest.service.ArticleService;
import com.rymcu.forest.service.PortfolioService;
@ -72,9 +74,21 @@ public class PortfolioServiceImpl extends AbstractService<Portfolio> implements
portfolio.setCreatedTime(new Date());
portfolio.setUpdatedTime(portfolio.getCreatedTime());
portfolioMapper.insertSelective(portfolio);
PortfolioIndexUtil.addIndex(
PortfolioLucene.builder()
.idPortfolio(portfolio.getIdPortfolio().toString())
.portfolioTitle(portfolio.getPortfolioTitle())
.portfolioDescription(portfolio.getPortfolioDescription())
.build());
} else {
portfolio.setUpdatedTime(new Date());
portfolioMapper.updateByPrimaryKeySelective(portfolio);
PortfolioIndexUtil.updateIndex(
PortfolioLucene.builder()
.idPortfolio(portfolio.getIdPortfolio().toString())
.portfolioTitle(portfolio.getPortfolioTitle())
.portfolioDescription(portfolio.getPortfolioDescription())
.build());
}
return portfolio;
}
@ -176,6 +190,8 @@ public class PortfolioServiceImpl extends AbstractService<Portfolio> implements
Integer result = portfolioMapper.deleteByPrimaryKey(idPortfolio);
if (result.equals(0)) {
map.put("message", "操作失败!");
} else {
PortfolioIndexUtil.deleteIndex(String.valueOf(idPortfolio));
}
}

View File

@ -1,43 +0,0 @@
package com.rymcu.forest.service.impl;
import com.rymcu.forest.core.service.redis.RedisResult;
import com.rymcu.forest.core.service.redis.RedisService;
import com.rymcu.forest.dto.SearchModel;
import com.rymcu.forest.mapper.SearchMapper;
import com.rymcu.forest.service.SearchService;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
/**
* @author ronger
*/
@Service
public class SearchServiceImpl implements SearchService {
@Resource
private SearchMapper searchMapper;
@Resource
private RedisService redisService;
@Override
public List<SearchModel> initialSearch() {
String searchKey = "initialSearch";
RedisResult<SearchModel> result = redisService.getListResult(searchKey, SearchModel.class);
if (Objects.nonNull(result.getListResult())) {
return result.getListResult();
}
List<SearchModel> list = new ArrayList<>();
List<SearchModel> articleSearchModels = searchMapper.searchInitialArticleSearch();
List<SearchModel> portfolioSearchModels = searchMapper.searchInitialPortfolioSearch();
List<SearchModel> userSearchModels = searchMapper.searchInitialUserSearch();
list.addAll(articleSearchModels);
list.addAll(portfolioSearchModels);
list.addAll(userSearchModels);
redisService.set(searchKey, list, 24 * 60 * 60);
return list;
}
}

View File

@ -7,6 +7,8 @@ import com.rymcu.forest.entity.Role;
import com.rymcu.forest.entity.User;
import com.rymcu.forest.entity.UserExtend;
import com.rymcu.forest.jwt.service.TokenManager;
import com.rymcu.forest.lucene.model.UserLucene;
import com.rymcu.forest.lucene.util.UserIndexUtil;
import com.rymcu.forest.mapper.RoleMapper;
import com.rymcu.forest.mapper.UserExtendMapper;
import com.rymcu.forest.mapper.UserMapper;
@ -75,6 +77,11 @@ public class UserServiceImpl extends AbstractService<User> implements UserServic
user = userMapper.findByAccount(email);
Role role = roleMapper.selectRoleByInputCode("user");
userMapper.insertUserRole(user.getIdUser(), role.getIdRole());
UserIndexUtil.addIndex(UserLucene.builder()
.idUser(user.getIdUser())
.nickname(user.getNickname())
.signature(user.getSignature())
.build());
map.put("message","注册成功!");
map.put("flag",1);
redisService.delete(email);
@ -194,6 +201,11 @@ public class UserServiceImpl extends AbstractService<User> implements UserServic
}
Integer result = userMapper.updateUserInfo(user.getIdUser(), user.getNickname(), user.getAvatarType(),user.getAvatarUrl(),
user.getEmail(),user.getPhone(),user.getSignature(), user.getSex());
UserIndexUtil.updateIndex(UserLucene.builder()
.idUser(user.getIdUser())
.nickname(user.getNickname())
.signature(user.getSignature())
.build());
if (result == 0) {
map.put("message", "操作失败!");
return map;
@ -239,8 +251,8 @@ public class UserServiceImpl extends AbstractService<User> implements UserServic
}
@Override
public UserExtend selectUserExtendByAccount(String nickname) {
return userExtendMapper.selectUserExtendByNickname(nickname);
public UserExtend selectUserExtendByAccount(String account) {
return userExtendMapper.selectUserExtendByAccount(account);
}
@Override

View File

@ -3,6 +3,8 @@ package com.rymcu.forest.util;
import com.github.pagehelper.PageInfo;
import com.rymcu.forest.dto.ArticleDTO;
import com.rymcu.forest.dto.NotificationDTO;
import com.rymcu.forest.dto.PortfolioDTO;
import com.rymcu.forest.dto.UserDTO;
import com.rymcu.forest.entity.Notification;
import com.rymcu.forest.entity.User;
import org.apache.shiro.SecurityUtils;
@ -145,6 +147,28 @@ public class Utils {
return map;
}
public static Map getUserGlobalResult(PageInfo<UserDTO> pageInfo) {
Map map = new HashMap(2);
map.put("users", pageInfo.getList());
Map pagination = new HashMap(4);
pagination.put("pageSize",pageInfo.getPageSize());
pagination.put("total",pageInfo.getTotal());
pagination.put("currentPage",pageInfo.getPageNum());
map.put("pagination", pagination);
return map;
}
public static Map getPortfolioGlobalResult(PageInfo<PortfolioDTO> pageInfo) {
Map map = new HashMap(2);
map.put("portfolios", pageInfo.getList());
Map pagination = new HashMap(4);
pagination.put("pageSize",pageInfo.getPageSize());
pagination.put("total",pageInfo.getTotal());
pagination.put("currentPage",pageInfo.getPageNum());
map.put("pagination", pagination);
return map;
}
public static Map getNotificationsGlobalResult(PageInfo<Notification> pageInfo) {
Map map = new HashMap(2);
map.put("notifications", pageInfo.getList());

View File

@ -7,7 +7,6 @@ import com.rymcu.forest.core.result.GlobalResultGenerator;
import com.rymcu.forest.core.result.GlobalResultMessage;
import com.rymcu.forest.core.service.log.annotation.VisitLogger;
import com.rymcu.forest.dto.*;
import com.rymcu.forest.entity.Portfolio;
import com.rymcu.forest.entity.User;
import com.rymcu.forest.service.*;
import com.rymcu.forest.util.UserUtils;
@ -35,8 +34,6 @@ public class CommonApiController {
private ArticleService articleService;
@Resource
private PortfolioService portfolioService;
@Resource
private SearchService SearchService;
@GetMapping("/get-email-code")
public GlobalResult<Map<String, String>> getEmailCode(@RequestParam("email") String email) throws MessagingException {
@ -145,22 +142,4 @@ public class CommonApiController {
Map map = Utils.getArticlesGlobalResult(pageInfo);
return GlobalResultGenerator.genSuccessResult(map);
}
@GetMapping("/initial-search")
public GlobalResult initialSearch() {
List<SearchModel> list = SearchService.initialSearch();
return GlobalResultGenerator.genSuccessResult(list);
}
@GetMapping("/portfolios")
public GlobalResult portfolios(@RequestParam(defaultValue = "0") Integer page, @RequestParam(defaultValue = "12") Integer rows) {
PageHelper.startPage(page, rows);
List<Portfolio> list = portfolioService.findPortfolios();
PageInfo<Portfolio> pageInfo = new PageInfo(list);
Map map = new HashMap(2);
map.put("portfolios", pageInfo.getList());
Map pagination = Utils.getPagination(pageInfo);
map.put("pagination", pagination);
return GlobalResultGenerator.genSuccessResult(map);
}
}

View File

@ -9,8 +9,8 @@
<result column="qq" property="qq"></result>
<result column="blog" property="blog"></result>
</resultMap>
<select id="selectUserExtendByNickname" resultMap="BaseResultMap">
<select id="selectUserExtendByAccount" resultMap="BaseResultMap">
select vue.* from forest_user_extend vue join forest_user vu on vue.id_user = vu.id
where vu.nickname = #{nickname} limit 1
where vu.account = #{account} limit 1
</select>
</mapper>

View File

@ -0,0 +1,66 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >
<mapper namespace="com.rymcu.forest.lucene.mapper.ArticleLuceneMapper">
<resultMap id="BaseResultMap" type="com.rymcu.forest.lucene.model.ArticleLucene">
<id column="id" property="idArticle" jdbcType="INTEGER"/>
<result column="article_title" property="articleTitle" jdbcType="VARCHAR"/>
</resultMap>
<resultMap id="ResultMapWithBLOBs" type="com.rymcu.forest.lucene.model.ArticleLucene" extends="BaseResultMap">
<result column="article_content" property="articleContent" jdbcType="LONGVARCHAR"/>
</resultMap>
<resultMap id="DTOResultMap" type="com.rymcu.forest.dto.ArticleDTO">
<result column="id" property="idArticle"></result>
<result column="article_title" property="articleTitle"></result>
<result column="article_thumbnail_url" property="articleThumbnailUrl"></result>
<result column="article_author_id" property="articleAuthorId"></result>
<result column="nickname" property="articleAuthorName"></result>
<result column="avatar_url" property="articleAuthorAvatarUrl"></result>
<result column="article_type" property="articleType"></result>
<result column="article_tags" property="articleTags"></result>
<result column="article_view_count" property="articleViewCount"></result>
<result column="article_preview_content" property="articlePreviewContent"></result>
<result column="article_content" property="articleContent"></result>
<result column="comment_count" property="articleCommentCount"></result>
<result column="time_ago" property="timeAgo"></result>
<result column="article_permalink" property="articlePermalink"></result>
<result column="article_link" property="articleLink"></result>
<result column="article_status" property="articleStatus"></result>
<result column="updated_time" property="updatedTime"></result>
<result column="sort_no" property="sortNo"></result>
<result column="article_perfect" property="articlePerfect"></result>
<result column="article_thumbs_up_count" property="articleThumbsUpCount"></result>
<result column="article_sponsor_count" property="articleSponsorCount"></result>
</resultMap>
<select id="getAllArticleLucene" resultMap="ResultMapWithBLOBs">
select art.id, art.article_title, content.article_content
from forest_article art
join forest_article_content content on art.id = content.id_article
where article_status = 0;
</select>
<select id="getArticlesByIds" resultMap="DTOResultMap">
select art.*, su.nickname, su.avatar_url
from forest_article art
join forest_user su on art.article_author_id = su.id
where article_status = 0
and art.id in
<foreach collection="ids" item="id" index="index"
open="(" close=")" separator=",">
#{id}
</foreach>
order by
field(art.id
<foreach collection="ids" item="id" index="index"
open="," close=")" separator=",">
#{id}
</foreach>
</select>
<select id="getById" resultMap="ResultMapWithBLOBs">
select art.id, art.article_title, content.article_content
from forest_article art
join forest_article_content content on art.id = content.id_article
where article_status = 0
and id = #{id};
</select>
</mapper>

View File

@ -0,0 +1,44 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >
<mapper namespace="com.rymcu.forest.lucene.mapper.PortfolioLuceneMapper">
<resultMap id="BaseResultMap" type="com.rymcu.forest.lucene.model.PortfolioLucene">
    <id column="id" property="idPortfolio"></id>
<result column="portfolio_title" property="portfolioTitle"></result>
<result column="portfolio_description" property="portfolioDescription"></result>
</resultMap>
<resultMap id="DTOResultMap" type="com.rymcu.forest.dto.PortfolioDTO">
<result column="id" property="idPortfolio"></result>
<result column="portfolio_head_img_url" property="headImgUrl"></result>
<result column="portfolio_title" property="portfolioTitle"></result>
<result column="portfolio_author_id" property="portfolioAuthorId"></result>
<result column="portfolio_description" property="portfolioDescription"></result>
<result column="updated_time" property="updatedTime"></result>
</resultMap>
<select id="getAllPortfolioLucene" resultMap="BaseResultMap">
SELECT id, portfolio_title, portfolio_description
FROM forest_portfolio
</select>
<select id="getPortfoliosByIds" resultMap="DTOResultMap">
        select id, portfolio_head_img_url, portfolio_title, portfolio_author_id, portfolio_description, updated_time from forest_portfolio where
id in
<foreach collection="ids" item="id" index="index"
open="(" close=")" separator=",">
#{id}
</foreach>
order by
field(id
<foreach collection="ids" item="id" index="index"
open="," close=")" separator=",">
#{id}
</foreach>
</select>
<select id="getById" resultMap="BaseResultMap">
SELECT id, portfolio_title, portfolio_description
FROM forest_portfolio
where id = #{id};
</select>
</mapper>

View File

@ -0,0 +1,30 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >
<mapper namespace="com.rymcu.forest.lucene.mapper.UserDicMapper">
<select id="getAllDic" resultType="java.lang.String">
select dic
from forest_lucene_user_dic
</select>
<select id="getAll" resultType="com.rymcu.forest.lucene.model.UserDic">
select *
from forest_lucene_user_dic
</select>
<insert id="addDic">
insert into forest_lucene_user_dic(dic) value (#{dic})
</insert>
<delete id="deleteDic">
delete
from forest_lucene_user_dic
where id = (#{id})
</delete>
<update id="updateDic">
update forest_lucene_user_dic
set dic=#{dic}
where id = (#{id})
</update>
</mapper>

View File

@ -0,0 +1,42 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >
<mapper namespace="com.rymcu.forest.lucene.mapper.UserLuceneMapper">
<resultMap id="BaseResultMap" type="com.rymcu.forest.lucene.model.UserLucene">
<id column="id" property="idUser" jdbcType="INTEGER"/>
<result column="nickname" property="nickname" jdbcType="VARCHAR"/>
<result column="signature" property="signature" jdbcType="VARCHAR"/>
</resultMap>
<resultMap id="DTOResultMapper" type="com.rymcu.forest.dto.UserDTO">
<result column="id" property="idUser"/>
<result column="account" property="account"/>
<result column="nickname" property="nickname"/>
<result column="avatar_type" property="avatarType"/>
<result column="avatar_url" property="avatarUrl"/>
<result column="signature" property="signature"/>
</resultMap>
<select id="getAllUserLucene" resultMap="BaseResultMap">
SELECT id, nickname, signature
FROM forest_user
</select>
<select id="getUsersByIds" resultMap="DTOResultMapper">
select id, nickname, avatar_type, avatar_url, account, signature from forest_user where
id in
<foreach collection="ids" item="id" index="index"
open="(" close=")" separator=",">
#{id}
</foreach>
order by
field(id
<foreach collection="ids" item="id" index="index"
open="," close=")" separator=",">
#{id}
</foreach>
</select>
<select id="getById" resultMap="BaseResultMap">
SELECT id, nickname, signature
FROM `forest_user`
where id = #{id};
</select>
</mapper>

View File

@ -0,0 +1,8 @@
屌丝男士
吊死男士
diaosinanshi
男屌丝
男吊丝士
吊男丝士

View File

@ -0,0 +1,4 @@
全文搜索
苏雄
杨跃进
癜风

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,316 @@
世纪
位数
像素
克拉
公亩
公克
公分
公升
公尺
公担
公斤
公里
公顷
分钟
分米
加仑
千克
千米
厘米
周年
小时
平方
平方公尺
平方公里
平方分米
平方厘米
平方码
平方米
平方英寸
平方英尺
平方英里
平米
年代
年级
月份
毫升
毫米
毫克
海里
点钟
盎司
秒钟
立方公尺
立方分米
立方厘米
立方码
立方米
立方英寸
立方英尺
英亩
英寸
英尺
英里
阶段

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,64 @@
a
an
and
are
as
at
be
but
by
for
if
in
into
is
it
no
not
of
on
or
such
that
the
their
then
there
these
they
this
to
was
will
with
使

2108
src/main/resources/lucene/web.dic Executable file

File diff suppressed because it is too large Load Diff

View File

@ -340,6 +340,18 @@ create table forest_visit
)
comment '浏览表' collate = utf8mb4_unicode_ci;
create table forest_lucene_user_dic
(
id int auto_increment comment '字典编号',
dic char(32) null comment '字典',
constraint forest_lucene_user_dic_id_uindex
unique (id)
)
comment '用户扩展字典';
alter table forest_lucene_user_dic
add primary key (id);
insert into forest.forest_role (id, name, input_code, status, created_time, updated_time, weights) values (1, '管理员', 'admin', '0', '2019-11-16 04:22:45', '2019-11-16 04:22:45', 1);
insert into forest.forest_role (id, name, input_code, status, created_time, updated_time, weights) values (2, '社区管理员', 'blog_admin', '0', '2019-12-05 03:10:05', '2019-12-05 17:11:35', 2);
insert into forest.forest_role (id, name, input_code, status, created_time, updated_time, weights) values (3, '作者', 'zz', '0', '2020-03-12 15:07:27', '2020-03-12 15:07:27', 3);
@ -347,4 +359,4 @@ insert into forest.forest_role (id, name, input_code, status, created_time, upda
insert into forest.forest_user (id, account, password, nickname, real_name, sex, avatar_type, avatar_url, email, phone, status, created_time, updated_time, last_login_time, signature) values (1, 'admin', '8ce2dd866238958ac4f07870766813cdaa39a9b83a8c75e26aa50f23', 'admin', 'admin', '0', '0', null, null, null, '0', '2021-01-25 18:21:51', '2021-01-25 18:21:54', null, null);
insert into forest.forest_user_role (id_user, id_role, created_time) values (1, 1, '2021-01-25 18:22:12');