spark+ansj中文分词

我是在 Win10 中创建的 Maven 项目，使用的是 ansj_seg 5.1.6 版本：

        <dependency>
            <groupId>org.ansj</groupId>
            <artifactId>ansj_seg</artifactId>
            <version>5.1.6</version>
        </dependency>

至于 Spark 环境在 Win10 中的配置，不是这篇文章的重点，这里不做介绍。

话不多说，直接上代码：



import java.util

import org.ansj.library.{DicLibrary, StopLibrary}
import org.ansj.recognition.impl.StopRecognition
import org.ansj.splitWord.analysis.ToAnalysis
import org.ansj.util.MyStaticValue
import org.apache.spark.{SparkConf, SparkContext}

import scala.io.Source


  /**
   * Word-frequency demo: segments a Chinese text file with ansj_seg inside a
   * local Spark job and writes the (word, count) pairs sorted by descending count.
   */
  object ansjtest {

    /**
     * Loads the user dictionary and the stop-word list, segments every non-empty
     * line of the input file, counts the resulting tokens and saves the counts.
     *
     * @param args command-line arguments (unused; all paths are hard-coded below)
     */
    def main(args: Array[String]): Unit = {
      val stop = new StopRecognition()
      stop.insertStopNatures("w") // drop punctuation
      stop.insertStopNatures("m") // drop nature "m"
      stop.insertStopNatures("null") // drop nature "null"
      // NOTE(review): "<br />", ":" and "'" are registered as natures here; if they
      // were meant to be literal tokens, insertStopWords may be the intended call — confirm.
      stop.insertStopNatures("<br />")
      stop.insertStopNatures(":")
      stop.insertStopNatures("'")
      //    stop.insertStopWords("的")

      val conf = new SparkConf().setMaster("local").setAppName("anjfencitest")
      val sc = new SparkContext(conf)
      // try/finally guarantees the SparkContext is stopped even if loading or the job fails.
      try {
        val input = "D:\\**\\新闻.txt" // data file
        val output = "D:\\**\\resultanj" // directory the result is saved to
        val dicpath = "D:\\**\\ambiguity.dic" // local user dictionary path
        val stoppath = "D:\\**\\stopword.dic" // local stop-word dictionary path

        // Load the user-defined dictionary.
        // NOTE(review): this registers the dictionary in the driver JVM only, which is
        // enough for master "local"; on a cluster the executors would presumably need
        // the same setup — verify before changing the master.
        DicLibrary.put("dic", dicpath)

        // Load the stop words, one per line; close the file handle when done.
        val stopSource = Source.fromFile(stoppath, "UTF-8")
        try {
          stopSource.getLines()
            .map(_.trim)
            .filter(_.nonEmpty)
            .foreach(word => stop.insertStopWords(word))
        } finally {
          stopSource.close()
        }
        //    StopLibrary.put("stop_dic", stoppath, stop)  // this approach did not work, reason unknown

        sc.textFile(input)
          // Skip empty lines: the original `if` without `else` turned them into the token "()".
          .filter(_.nonEmpty)
          .flatMap { line =>
            ToAnalysis.parse(line).recognition(stop).toStringWithOutNature(" ").split(" ")
          }
          // A line whose tokens are all filtered out yields "", which must not be counted.
          .filter(_.nonEmpty)
          .map((_, 1))
          .reduceByKey(_ + _)
          .sortBy(_._2, ascending = false)
          .saveAsTextFile(output)
      } finally {
        sc.stop()
      }
    }

    /**
     * Builds a StopRecognition with a stricter set of filters. Not called by `main`.
     *
     * Adapted from another blog post: https://blog.csdn.net/zh519080/article/details/81224621
     *
     * Side effect: sets the global flag `MyStaticValue.isQuantifierRecognition` to true.
     *
     * @param arrayList stop words to register on the returned recognizer
     * @return the configured StopRecognition
     */
    def stopRecognitionFilter(arrayList: util.ArrayList[String]): StopRecognition = {

      MyStaticValue.isQuantifierRecognition = true // merge numbers with their quantifiers

      val stopRecognition = new StopRecognition

      // Natures to drop: preposition (p), interjection (e), conjunction (c), pronoun (r),
      // particle (u), string (x), onomatopoeia (o)
      stopRecognition.insertStopNatures("p", "e", "c", "r", "u", "x", "o")

      stopRecognition.insertStopNatures("w") // drop punctuation

      // Drop tokens that start with a Chinese numeral followed by at most two more characters.
      stopRecognition.insertStopRegexes("^一.{0,2}", "^二.{0,2}", "^三.{0,2}", "^四.{0,2}", "^五.{0,2}",
        "^六.{0,2}", "^七.{0,2}", "^八.{0,2}", "^九.{0,2}", "^十.{0,2}")

      stopRecognition.insertStopNatures("null") // drop empty results

      stopRecognition.insertStopRegexes(".{0,1}") // drop tokens of at most one character

      stopRecognition.insertStopRegexes("^[a-zA-Z]{1,}") // drop tokens made only of English letters

      stopRecognition.insertStopWords(arrayList) // register the caller's stop words

      stopRecognition.insertStopRegexes("^[0-9]+") // drop tokens made only of digits

      // Drop tokens that are not Chinese characters, English letters or digits.
      stopRecognition.insertStopRegexes("[^a-zA-Z0-9\\u4e00-\\u9fa5]+")

      stopRecognition
    }
  }

经过测试还是很好用的 ^_^

继续阅读与本文标签相同的文章

无标签

世界上最大的3D打印船

搭建本地/局域网maven仓库

收藏打印

spark+ansj中文分词

浏览：1653 2026-05-07

继续阅读与本文标签相同的文章

世界上最大的3D打印船

搭建本地/局域网maven仓库

特别推荐 2026年05月18日星期一

精彩发现

热门标签

spark+ansj中文分词

浏览：1653 2026-05-07

继续阅读与本文标签相同的文章

2026-05-18栏目： 教程

2026-05-18栏目： 教程

2026-05-18栏目： 教程

2026-05-18栏目： 教程

2026-05-18栏目： 教程

2026-04-23栏目： 教程

2026-04-23栏目： 教程

2026-04-23栏目： 教程

2026-04-23栏目： 教程

2026-04-24栏目： 教程

特别推荐 2026年05月18日 星期一

精彩发现

热门标签

相关文章

2026-05-18栏目：教程

2026-05-18栏目：教程

2026-05-18栏目：教程

2026-05-18栏目：教程

2026-05-18栏目：教程

2026-04-23栏目：教程

2026-04-23栏目：教程

2026-04-23栏目：教程

2026-04-23栏目：教程

2026-04-24栏目：教程

特别推荐 2026年05月18日星期一