当前位置 : 主页 > 网络编程 > 其它编程 >

三、Spark 入门:找出文本中最常用的 5 个单词(排除常用停用词)

来源:互联网 收集:自由互联 发布时间:2023-07-02
packagecom.yl.wordcountimportjava.io.Fileimportorg.apache.spark.{SparkConf,SparkContext}imports package com.yl.wordcountimport java.io.Fileimport org.apache.spark.{SparkConf, SparkContext}import scala.collection.Iteratorimport scala.io.Sourc
package com.yl.wordcount

import java.io.File

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.Iterator
import scala.io.Source

/**
 * Word count over a text file that drops stop words and writes the
 * (count, word) pairs ordered by descending count.
 */
object WordCountStopWords {

  /**
   * Reads one stop word per line from `file`.
   *
   * @param file the stop-word list to read
   * @return the distinct lines of the file, or an empty set when the file
   *         does not exist (in which case no word is filtered out)
   */
  private def loadStopWords(file: File): Set[String] =
    if (!file.exists()) {
      Set.empty[String]
    } else {
      val source = Source.fromFile(file)
      // toSet materialises every line before the handle is released.
      try source.getLines().toSet
      finally source.close()
    }

  /**
   * Entry point: counts the words of the README, excluding stop words, and
   * saves the result as text under the output directory.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("spark://localhost:7077").setAppName("wordcount")
    val sc = new SparkContext(conf)
    try {
      val outFile = "/Users/admin/spark/sparkoutput"
      val stopWords = loadStopWords(new File("/Users/admin/src/tingyongci.txt"))
      val textFile = sc.textFile("/Users/admin/spark/spark-1.5.1-bin-hadoop2.4/README.md")

      val result = textFile
        .flatMap(_.split(" "))
        .filter(_.nonEmpty)
        .filter(word => !stopWords.contains(word))
        .map((_, 1))
        .reduceByKey(_ + _)
        // Swap to (count, word) so that sortByKey orders by frequency.
        .map { case (word, count) => (count, word) }
        .sortByKey(ascending = false)

      result.saveAsTextFile(outFile)
    } finally {
      // Stop the context even when the job fails.
      sc.stop()
    }
  }
}
网友评论