当前位置 : 主页 > 编程语言 > c++ >

读取全英文txt,统计单词的词频并输出

来源:互联网 收集:自由互联 发布时间:2021-07-03
import java.io.BufferedReader;import java.io.File;import java.io.FileReader;import java.util.ArrayList;import java.util.Iterator;import java.util.List;import java.util.Set;import java.util.TreeSet; public class TestTxt { File file; String c…(摘要已截断,完整代码见下文)
 
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
  
/**
 * Reads an English text file, splits it into words and reports how often each
 * distinct word occurs.
 *
 * <p>Typical use: fill {@link #content} (for example from
 * {@link #txtToString(File)}), call {@link #splitWord()}, then
 * {@link #countWordFreq()}. Afterwards {@link #words} and {@link #wordFreqs}
 * hold the result as two parallel arrays.
 *
 * <p>Word comparison is case-sensitive, so {@code "Hello"} and {@code "hello"}
 * are counted as different words.
 */
public class TestTxt {

    File file;          // input file to analyse
    String content;     // full text of the article
    String[] rawWords;  // every token of the text in reading order, duplicates included
    String[] words;     // distinct words, sorted in natural String order
    int[] wordFreqs;    // wordFreqs[i] is the number of occurrences of words[i]

    /**
     * Reads the whole file into a string, terminating every line with
     * {@code '\n'}.
     *
     * <p>The file is decoded by {@link FileReader}; no charset is specified
     * here. I/O failures are reported on standard error and whatever was read
     * up to that point (possibly the empty string) is returned, so the method
     * never returns {@code null}.
     *
     * @param file the text file to read
     * @return the file content with {@code '\n'} after each line
     */
    public static String txtToString(File file) {
        StringBuilder text = new StringBuilder();
        // try-with-resources closes the reader even when readLine() fails.
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // A real newline keeps the last word of one line separate from
                // the first word of the next one.
                text.append(line).append('\n');
            }
        } catch (IOException e) {
            // Best effort: report the problem and return what was read so far.
            e.printStackTrace();
        }
        return text.toString();
    }

    /**
     * Splits {@link #content} into words and stores them in {@link #rawWords}.
     *
     * <p>The characters {@code ' , . ( ) -} are replaced by spaces (the
     * replaced text is written back to {@link #content}), then the text is
     * split on runs of whitespace. Text without any word yields an empty
     * array.
     */
    public void splitWord() {
        final char SPACE = ' ';  // every separator character is normalised to a space
        content = content.replace('\'', SPACE).replace(',', SPACE).replace('.', SPACE);
        content = content.replace('(', SPACE).replace(')', SPACE).replace('-', SPACE);

        // String.split keeps a leading empty token when the text starts with a
        // separator, and "".split(...) returns [""]; trimming first avoids both.
        String trimmed = content.trim();
        if (trimmed.isEmpty()) {
            rawWords = new String[0];
        } else {
            rawWords = trimmed.split("\\s+");
        }
    }

    /**
     * Counts how often each distinct word of {@link #rawWords} occurs.
     *
     * <p>Prints the sorted set of distinct words to standard output, then
     * stores the words in {@link #words} and their counts, index for index, in
     * {@link #wordFreqs}. Empty tokens are ignored.
     */
    public void countWordFreq() {
        // A sorted map gives the same ordering as the sorted set of words while
        // counting in a single pass over the tokens.
        Map<String, Integer> counts = new TreeMap<String, Integer>();
        for (String word : rawWords) {
            if (word.isEmpty()) {
                continue;  // an empty token is not a word
            }
            Integer seen = counts.get(word);
            counts.put(word, seen == null ? 1 : seen + 1);
        }
        System.out.println(counts.keySet());

        words = new String[counts.size()];
        wordFreqs = new int[counts.size()];
        int index = 0;
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            words[index] = entry.getKey();
            wordFreqs[index] = entry.getValue();
            index++;
        }
    }

    /**
     * Analyses {@code D:/test1.txt} and prints one {@code word:count} line per
     * distinct word.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        TestTxt t = new TestTxt();
        t.file = new File("D:/test1.txt");
        t.content = txtToString(t.file);
        t.splitWord();
        t.countWordFreq();
        for (int i = 0; i < t.wordFreqs.length; i++) {
            System.out.println(t.words[i] + ":" + t.wordFreqs[i]);
        }
    }

}

网友评论