（5）文本挖掘（二）——中英文分词

缺乏、安全感 2022-07-28 01:21 176阅读 0赞

**一、分词抽象类Segment**

package segment;
    
    import java.util.HashSet;
    import java.util.Set;
    import util.ReadTXT;
    
    /**
     * Abstract base class for word segmenters.
     *
     * <p>Holds the stop-word set used by concrete segmenters; subclasses supply
     * the actual tokenization in {@link #segment(String)}.
     *
     * @author Angela
     */
    public abstract class Segment {

        /** Stop words; the constructors and setter of this class keep it non-null. */
        protected Set<String> stopwords;

        /** Creates a segmenter with an empty stop-word set. */
        public Segment() {
            stopwords = new HashSet<String>();
        }

        /**
         * Creates a segmenter whose stop words are loaded with
         * {@code ReadTXT.toSet(stopwordPath)}.
         *
         * @param stopwordPath path of the stop-word file
         */
        public Segment(String stopwordPath) {
            // Guard against a loader that yields null so that later
            // stopwords.contains(...) calls cannot fail with an NPE.
            stopwords = orEmpty(ReadTXT.toSet(stopwordPath));
        }

        /**
         * Segments the given text.
         *
         * @param content text to segment
         * @return the segmentation result as a single string that uses a space
         *         as the delimiter between words
         */
        public abstract String segment(String content);

        /**
         * @return the stop-word set currently in use (the live set, not a copy)
         */
        public Set<String> getStopwords() {
            return stopwords;
        }

        /**
         * Replaces the stop-word set.
         *
         * @param stopwords the stop words to use; {@code null} is treated as
         *                  "no stop words" and replaced by an empty set
         */
        public void setStopwords(Set<String> stopwords) {
            this.stopwords = orEmpty(stopwords);
        }

        /** Returns {@code words} itself, or a fresh empty set when it is null. */
        private static Set<String> orEmpty(Set<String> words) {
            return (words != null) ? words : new HashSet<String>();
        }

    }

**二、英文分词类EnglishSegment**

英文分词需要用到词干提取算法PorterAlgorithm，请参考[http://blog.csdn.net/fighting\_no1/article/details/50927162][http_blog.csdn.net_fighting_no1_article_details_50927162]。停用词是我自己整理的。

这里使用Java开源项目Lucene中的StandardAnalyzer标准分词器对英文进行分词，并使用PorterStemFilter（Porter词干提取算法）进行词干提取。

package segment;
    
    import java.io.IOException;
    import java.util.logging.Level;
    import java.util.logging.Logger;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.en.PorterStemFilter;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    
    /** * 英文分词类 * @author Angela */
    public class EnglishSegment extends Segment{ 
    
        private Analyzer analyzer;//英文分词器
    
        public EnglishSegment(){
            super();
            analyzer=new StandardAnalyzer();//默认标准分词器
        }
    
        public EnglishSegment(String stopwordPath){
            super(stopwordPath);//设置停用词
            analyzer=new StandardAnalyzer();//默认标准分词器
        }
    
        /** * 英文分词 * step1 英文词法分析，去除数字、连字符、标点符号、特殊字符 * step2 去停用词 * step3 词干提取 * @param content 文本内容 * @return */
        public String segment(String content){
            StringBuilder sb=new StringBuilder();      
            try {
                TokenStream tokenStream = analyzer.tokenStream(null, content);
                //设置波特词干提取器，自动去除停用词
                tokenStream=new PorterStemFilter(tokenStream);
                OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);  
                CharTermAttribute term=tokenStream.addAttribute(CharTermAttribute.class);
                tokenStream.reset();//重置
                while( tokenStream.incrementToken() ){
                    int startOffset = offsetAttribute.startOffset(); 
                    int endOffset = offsetAttribute.endOffset(); 
                    //去除数字
                    String word=term.toString().replaceAll("[^a-zA-Z]", "");
                    if(word.length()!=0&&!stopwords.contains(word))
                        sb.append(word+" ");
                }
                tokenStream.end();
                tokenStream.close();
            } catch (IOException ex) {
                Logger.getLogger(EnglishSegment.class.getName()).log(Level.SEVERE, null, ex);
            }      
            return sb.toString();
        }    
    
        public void setAnalyzer(Analyzer analyzer) {
            this.analyzer = analyzer;
        }
    
        public static void main(String args[]){
            String content="Clusters are created from 433short 4.5 snippets。9 of documents retrieved by web search engines which are as good as clusters created from the full text of web documents.";
            EnglishSegment es=new EnglishSegment();
            String result=es.segment(content);
            System.out.println(result);
        }
    }

**三、中文分词类ChineseSegment**

中文分词这里用的是mmseg4j工具包，词典和停用词都是我自己整理的。关于MMSEG分词算法，请参考[http://blog.csdn.net/fighting\_no1/article/details/50927171][http_blog.csdn.net_fighting_no1_article_details_50927171]。

/* * To change this template, choose Tools | Templates * and open the template in the editor. */
    package segment;
    
    import com.chenlb.mmseg4j.ComplexSeg;
    import com.chenlb.mmseg4j.Dictionary;
    import com.chenlb.mmseg4j.MMSeg;
    import com.chenlb.mmseg4j.Seg;
    import com.chenlb.mmseg4j.Word;
    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;
    import java.util.logging.Level;
    import java.util.logging.Logger;
    import util.ReadTXT;
    import util.WriteTXT;
    
    /** * 中文分词类 * @author Angela */
    public class ChineseSegment extends Segment{ 
    
        protected Dictionary dic;//词典
    
        public ChineseSegment(){
            super();
            this.dic = Dictionary.getInstance();
        }
    
        public ChineseSegment(String stopwordPath){
            super(stopwordPath);
            this.dic = Dictionary.getInstance();
        }
    
        /**分词并去掉停用词,去除数字字母等，只保留中文**/
        public String segment(String content){
            Reader input=new StringReader(content);
            StringBuilder sb = new StringBuilder();
            Seg seg = new ComplexSeg(dic);//正向最大匹配, 加四个过滤规则的分词方式
            MMSeg mmSeg = new MMSeg(input, seg);
            Word word = null;
            try {
                while((word=mmSeg.next())!=null) {
                    //去除非中文内容
                    String w = word.getString().replaceAll("[^\\u4e00-\\u9fa5]*", "");                
                    if(!stopwords.contains(w)&&w.length()!=0){
                        sb.append(w+" ");
                    }
                }
            } catch (IOException ex) {
                Logger.getLogger(ChineseSegment.class.getName()).log(Level.SEVERE, null, ex);
            }
            return sb.toString();
        }
    
        public static void main(String[] args) throws IOException {
            String stopwordPath="dic/cstopword.dic";
            ChineseSegment cs=new ChineseSegment(stopwordPath);
            /*String txt = "@Àα⒈①ⅷλ∅βχ∩✔☚☹★◀ΔουÏ.？]［■‥♪ねひみつㅙㅘ⒂⒖ " + "京华时报２００８年1月23日报道 昨天，受一股来自中西伯利亚的强冷空气影响，" + "本市出现大风降温天气，白天最高气温只有零下7摄氏度，同时伴有6.3到7.8级的偏北风。"; System.out.println(cs.segment(txt)); ChineseSegment cs=new ChineseSegment(); */
            String filePath="E:\\各种数据集\\复旦语料库\\test\\C11-Space\\C11-Space0002.txt";
            String result=cs.segment(ReadTXT.read(filePath));
            WriteTXT.write(result,"segment2.txt");
        }
    
    }

**四、文本分词类TextSegment**

/* * To change this template, choose Tools | Templates * and open the template in the editor. */
    package segment;
    
    import java.io.File;
    import java.util.HashMap;
    import java.util.Map;
    import util.MapUtil;
    import util.ReadTXT;
    import util.WriteTXT;
    
    /**
     * Segments text files and computes term frequencies (TF).
     *
     * <p>Notes:
     * <ol>
     *   <li>The input data-set path may be at most two directory levels deep.</li>
     *   <li>{@link #save(String)} stores its results next to the data set.</li>
     * </ol>
     *
     * @author Angela
     */
    public class TextSegment {

        /** Segmenter that does the actual tokenization. */
        private Segment seg;

        /**
         * @param seg segmenter to use for every text
         */
        public TextSegment(Segment seg) {
            this.seg = seg;
        }

        /**
         * Reads a text file and segments its content.
         *
         * @param filePath path of the text file
         * @return the segmenter's output for the file content
         */
        public String segment(String filePath) {
            String content = ReadTXT.read(filePath);
            return seg.segment(content);
        }

        /**
         * Segments a text file and counts how often each word occurs.
         *
         * @param filePath path of the text file
         * @return map from word to its number of occurrences; empty when the
         *         segmentation result contains no words
         */
        public Map<String,Integer> getTF(String filePath) {
            String result = segment(filePath);
            Map<String,Integer> tf = new HashMap<String,Integer>();
            if (result == null) {
                return tf;
            }
            for (String word : result.split(" ")) {
                // "".split(" ") yields one empty string; counting it would add
                // a bogus "" term to TF/DF and inflate the word total.
                if (word.isEmpty()) {
                    continue;
                }
                Integer count = tf.get(word);
                tf.put(word, (count == null) ? 1 : count + 1);
            }
            return tf;
        }

        /**
         * Segments one text file and writes its TF table, sorted in descending
         * order for easier reading, to {@code tarPath}.
         *
         * @param filePath path of the text file
         * @param tarPath  path of the output file
         */
        public void saveTF(String filePath,String tarPath) {
            Map<String,Integer> tf = getTF(filePath);
            tf = MapUtil.desc(tf);
            WriteTXT.writeMap(tf, tarPath);
            System.out.println("Saved "+tarPath);
        }

        /**
         * Processes every text of one category (one folder) and writes each
         * text's TF table to a file of the same name under {@code savePath}.
         *
         * @param filePath folder containing the texts
         * @param savePath output folder; created if missing
         */
        public void saveTFOfLabel(String filePath,String savePath) {
            ensureDirectory(savePath);
            for (File text : listOrEmpty(new File(filePath))) {
                String tarPath = savePath + File.separator + text.getName();
                saveTF(text.getAbsolutePath(), tarPath);
            }
        }

        /**
         * Processes every text of several categories (one sub-folder each) and
         * writes each text's TF table to
         * {@code savePath/<category>/<text name>}.
         *
         * @param filePath folder containing the category folders
         * @param savePath output folder; created if missing
         */
        public void saveTFOfLabels(String filePath,String savePath) {
            ensureDirectory(savePath);
            for (File category : listOrEmpty(new File(filePath))) {
                String tarPath = savePath + File.separator + category.getName();
                saveTFOfLabel(category.getAbsolutePath(), tarPath);
            }
        }

        /**
         * Segments and stores TF results next to {@code filePath}, detecting
         * whether the path is a single file, a folder of texts or a folder of
         * category folders (decided by looking at the first entry).
         *
         * <p>For a folder the output goes to {@code filePath + "TF"}; for a file
         * the output is the file path with its extension replaced by
         * {@code "TF.txt"}.
         *
         * @param filePath file or folder to process
         */
        public void save(String filePath) {
            File source = new File(filePath);
            if (source.isDirectory()) {
                String tarPath = filePath + "TF";
                File[] entries = listOrEmpty(source);
                // An empty folder has no first entry to inspect; treat it as a
                // (trivially empty) single-category folder.
                if (entries.length > 0 && entries[0].isDirectory()) {
                    saveTFOfLabels(filePath, tarPath);
                } else {
                    saveTFOfLabel(filePath, tarPath);
                }
            } else {
                // Only strip an extension that belongs to the file name itself,
                // not a dot in a parent folder name; tolerate names without one.
                int dot = filePath.lastIndexOf('.');
                int sep = Math.max(filePath.lastIndexOf('/'), filePath.lastIndexOf('\\'));
                String baseName = (dot > sep) ? filePath.substring(0, dot) : filePath;
                saveTF(filePath, baseName + "TF.txt");
            }
        }

        /**
         * Segments a two-level text collection ({@code filePath/<category>/<text>}),
         * stores each text's TF table, and stores the collection-wide document
         * frequency (DF) and inverse document frequency (IDF) tables. Also
         * prints the number of texts, the total word count and the average
         * number of words per text.
         *
         * <p>IDF is computed as {@code ln(textNum / DF)}. Entries of
         * {@code filePath} that are not directories are skipped.
         *
         * @param filePath    root folder of the text collection
         * @param savePath    root folder for the per-text TF tables
         * @param DFSavePath  output file for the DF table
         * @param IDFSavePath output file for the IDF table
         */
        public void saveTFDFIDF(String filePath,String savePath,
                String DFSavePath,String IDFSavePath) {
            ensureDirectory(savePath);
            int textNum = 0;   // number of texts
            long wordNum = 0;  // total number of words in the collection
            Map<String,Integer> DF = new HashMap<String,Integer>();
            Map<String,Double> IDF = new HashMap<String,Double>();
            for (File label : listOrEmpty(new File(filePath))) {
                if (!label.isDirectory()) {
                    continue;
                }
                String tarLabel = savePath + File.separator + label.getName();
                ensureDirectory(tarLabel);
                File[] texts = listOrEmpty(label);
                textNum += texts.length;
                for (File text : texts) {
                    String tarPath = tarLabel + File.separator + text.getName();
                    // TF table of this text
                    Map<String,Integer> TF = getTF(text.getAbsolutePath());
                    WriteTXT.writeMap(MapUtil.desc(TF), tarPath);
                    for (Map.Entry<String,Integer> me : TF.entrySet()) {
                        String f = me.getKey();
                        wordNum += me.getValue();
                        // Each text contributes at most 1 to a term's DF.
                        Integer df = DF.get(f);
                        DF.put(f, (df == null) ? 1 : df + 1);
                    }
                }
            }
            WriteTXT.writeMap(MapUtil.desc(DF), DFSavePath);
            System.out.println("总文本数："+textNum);
            System.out.println("总词数："+wordNum);
            System.out.println("文本平均词数："+wordNum*1.0/textNum);
            for (Map.Entry<String,Integer> me : DF.entrySet()) {
                IDF.put(me.getKey(), Math.log(textNum*1.0/me.getValue()));
            }
            WriteTXT.writeMap(MapUtil.desc(IDF), IDFSavePath);
        }

        /** Creates the directory (including missing parents) if it does not exist. */
        private static void ensureDirectory(String dirPath) {
            File dir = new File(dirPath);
            if (!dir.exists()) {
                dir.mkdirs();
            }
        }

        /** Lists a directory, returning an empty array where listFiles() gives null. */
        private static File[] listOrEmpty(File dir) {
            File[] entries = dir.listFiles();
            return (entries != null) ? entries : new File[0];
        }

        /** Demo: computes TF, DF and IDF for a Chinese corpus. */
        public static void main(String args[]) {
            String stopwordPath="dic/cstopword.dic";
            // Root folder of the text collection.
            String filePath="E:\\各种数据集\\复旦语料库\\test";
            // Choose the segmenter here: ChineseSegment for Chinese text,
            // EnglishSegment for English text. The stop-word file is passed in
            // so that stop words are actually filtered.
            ChineseSegment seg = new ChineseSegment(stopwordPath);
            TextSegment ts = new TextSegment(seg);
            // Alternative: ts.save(filePath) stores only the TF tables under
            // filePath+"TF"; it accepts a single text, a folder of texts or a
            // folder of category folders and detects which one it was given.
            ts.saveTFDFIDF(filePath, filePath+"TF", "testDF.txt", "testIDF.txt");
        }

    }

通过TextSegment类，我们就完成了复旦大学语料库和路透社语料库的分词与TF统计。下一节我将介绍TFIDF和文本向量空间模型VSM。

[http_blog.csdn.net_fighting_no1_article_details_50927162]: http://blog.csdn.net/fighting_no1/article/details/50927162
[http_blog.csdn.net_fighting_no1_article_details_50927171]: http://blog.csdn.net/fighting_no1/article/details/50927171