2

我使用的是Weka自带的示例数据集

  • weka.core.Instances
    容纳完整的数据集,可以提取行信息与列信息
  • weka.core.Instance
    一行信息,没有列信息
  • weka.core.Attribute
    列信息

一、加载数据

1.从文件中加载数据

基本的读取数据方式为

Instances data1=DataSource.read("data\\iris.arff");

如果文件的拓展名未知,我们可以指定加载器进行加载,例如我们可以把之前的iris.arff文件改成iris.data,然后通过指定加载器加载本地数据

package weka.loaddata;

import java.io.File;

import weka.core.Instances;
import weka.core.converters.ArffLoader;

public class Test {
    /**
     * Demonstrates loading a local file with an explicitly chosen loader
     * (ArffLoader) instead of relying on the file extension.
     */
    public static void main(String[] args) {
        try {
            // The file is ARFF-formatted even though its extension is
            // ".data", so we pick the loader ourselves.
            ArffLoader loader = new ArffLoader();
            loader.setSource(new File("data\\iris.data"));
            Instances data1 = loader.getDataSet();
            // Report success only when loading actually succeeded; the
            // original printed "done" even after an exception.
            System.out.println("done");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

arff和csv需要人为指定作为类别的字段

package weka.loaddata;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class Test {
    /**
     * Shows that a freshly loaded ARFF dataset has no class attribute
     * assigned: classIndex() returns -1 until one is set explicitly.
     */
    public static void main(String[] args) {
        try {
            Instances data1 = DataSource.read("data\\iris.arff");
            System.out.println(data1.classIndex()); // -1: no class set yet
            // Report success only when reading actually succeeded.
            System.out.println("done");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

返回-1代表此时并没有指定类别属性

package weka.loaddata;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class Test {
    /**
     * Designates the last attribute of the iris dataset as the class
     * attribute; classIndex() then reports its (0-based) index.
     */
    public static void main(String[] args) {
        try {
            Instances data1 = DataSource.read("data\\iris.arff");
            // ARFF carries no class marker, so pick the last column.
            data1.setClassIndex(data1.numAttributes() - 1);
            System.out.println(data1.classIndex());
            // Report success only when no exception occurred.
            System.out.println("done");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

于是我们通过上述程序将最后一个属性作为类别属性

2.从数据库加载数据

一种是InstanceQuery,允许一次性检索完整数据;一种是DatabaseLoader,允许增量检索

package weka.loaddata;

import weka.core.Instances;
import weka.experiment.InstanceQuery;

public class Test {
    /** Retrieves the whole iris table from MySQL in one go via InstanceQuery. */
    public static void main(String[] args) throws Exception {
        InstanceQuery instanceQuery = new InstanceQuery();
        instanceQuery.setDatabaseURL("jdbc:mysql://localhost:3306/new_schema");
        instanceQuery.setUsername("root");
        instanceQuery.setPassword("*******");
        instanceQuery.setQuery("select * from iris");
        // retrieveInstances() runs the query and materializes every row.
        Instances irisData = instanceQuery.retrieveInstances();
        System.out.println("done");
    }
}

我首先将iris数据加载进mysql数据库了
如果你用过jdbc的话,会发现这几个东西就是用的jdbc

package weka.loaddata;

import weka.core.Instances;
import weka.core.converters.DatabaseLoader;

public class Test {
    /** Batch retrieval: getDataSet() pulls the whole query result at once. */
    public static void main(String[] args) throws Exception {
        DatabaseLoader dbLoader = new DatabaseLoader();
        dbLoader.setSource("jdbc:mysql://localhost:3306/new_schema", "root",
                "*******");
        dbLoader.setQuery("select * from iris");
        Instances dataset = dbLoader.getDataSet();
    }
}

批量检索

package weka.loaddata;

import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.DatabaseLoader;

public class Test {
    /**
     * Incremental retrieval: fetch the table structure first, then pull
     * rows one by one and append them to an in-memory dataset.
     */
    public static void main(String[] args) throws Exception {
        DatabaseLoader dbLoader = new DatabaseLoader();
        dbLoader.setSource("jdbc:mysql://localhost:3306/new_schema", "root",
                "zxy123456");
        dbLoader.setQuery("select * from iris");

        Instances header = dbLoader.getStructure();
        Instances collected = new Instances(header);
        // getNextInstance returns null once the result set is exhausted.
        for (Instance row = dbLoader.getNextInstance(header); row != null;
                row = dbLoader.getNextInstance(header)) {
            collected.add(row);
        }
        System.out.println("done");
    }
}

增量检索

3.手把手

二、保存数据

package weka.loaddata;

import java.io.File;
import java.io.FileOutputStream;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSink;
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.converters.XRFFSaver;

public class Test {
    /** Saves the iris dataset three ways: CSV, ARFF stream, and XRFF. */
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data/iris.arff");

        // 1. Let DataSink infer the format from the file extension.
        DataSink.write("data/write_iris.csv", data);

        // 2. Write ARFF through an explicit output stream.
        FileOutputStream arffOut = new FileOutputStream("data/write_iris.arff");
        DataSink.write(arffOut, data);
        arffOut.close();

        // 3. Use a dedicated saver for the XRFF format.
        XRFFSaver xrffSaver = new XRFFSaver();
        xrffSaver.setInstances(data);
        xrffSaver.setFile(new File("data/write_iris.xrff"));
        xrffSaver.writeBatch();
        System.out.println("done");
    }
}

可以直接写,也可以指定加载器

2.保存数据到数据库

package weka.loaddata;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.converters.DatabaseSaver;

public class Test {
    // Batch-saves a dataset into a MySQL table with DatabaseSaver.
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data/iris.arff");
        DatabaseSaver saver = new DatabaseSaver();
        // NOTE(review): hard-coded credentials — acceptable only in a demo,
        // never in production code.
        saver.setDestination("jdbc:mysql://localhost:3306/new_schema", "root",
                "zxy123456");
        saver.setTableName("write_iris");
        // false: use the explicit table name above instead of the
        // dataset's relation name.
        saver.setRelationForTableName(false);
        saver.setInstances(data);
        // Write all rows in a single batch.
        saver.writeBatch();
        System.out.println("done");

    }
}

saver.setRelationForTableName(false);
如果是true的话,只能将数据的relation名作为表名,当然也可以改关系名啦

data.setRelationName(newName);
package weka.loaddata;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.converters.DatabaseSaver;

public class Test {
    // Incremental save: rows are written to the database one at a time
    // instead of in a single batch.
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data/iris.arff");
        DatabaseSaver saver = new DatabaseSaver();
        // NOTE(review): hard-coded credentials — demo code only.
        saver.setDestination("jdbc:mysql://localhost:3306/new_schema", "root",
                "zxy123456");
        saver.setTableName("write_iris");
        saver.setRelationForTableName(false);

        // Switch the saver into incremental mode before handing it the data.
        saver.setRetrieval(DatabaseSaver.INCREMENTAL);
        saver.setInstances(data);
        for (int i = 0; i < data.numInstances(); i++) {
            saver.writeIncremental(data.instance(i));
        }
        // A null instance signals the end of the incremental stream.
        saver.writeIncremental(null);
        System.out.println("done");

    }
}

增量保存,看起来就是一条一条存

三、处理选项

package weka.loaddata;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

public class Test {
    /** Removes the first attribute of iris via the Remove filter's options. */
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data/iris.arff");
        System.out.println(data);
        System.out.println("----------------");

        // "-R 1" selects attribute index 1 (1-based) for removal.
        Remove remove = new Remove();
        remove.setOptions(new String[] { "-R", "1" });
        remove.setInputFormat(data);
        Instances filtered = Filter.useFilter(data, remove);
        System.out.println(filtered);

    }
}

意思是去除一个属性,其他的东西该如何应用,还是看看api吧,

Instances inst1 = Filter.useFilter(data, rm);

这个应该是

clipboard.png

的使用方法,猜的,应该是。

四、内存数据集处理

1.在内存中创建数据集

可以通过内存提取数据,总共分两步
首先设置属性定义数据格式
其次一行一行地添加实际数据

1.1 定义数据格式

  • numeric:数值型,连续变量
Attribute numeric=new Attribute("attribute_name");
  • date:日期型,日期变量
Attribute date=new Attribute("attribute_name","yyyy-MM-dd");

具体日期格式参照SimpleDateFormat中对日期的规定

  • nominal:标称型,预定义的标签
ArrayList<String> labels=new ArrayList<String>();
        labels.add("label_a");
        labels.add("label_b");
        Attribute nominal=new Attribute("attribute_name",labels);
  • string:字符串型,文本数据
Attribute string = new Attribute("attribute_name",(ArrayList<String>)null);

这里提供一个类型为ArrayList&lt;String&gt;的null对象,目的是选中字符串属性对应的构造函数

  • relation:关系型
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("rel.numeric"));
        ArrayList<String> values = new ArrayList<String>();
        values.add("val_A");
        values.add("val_B");
        values.add("val_C");
        atts.add(new Attribute("rel.nominal"));
        Instances rel_struct = new Instances("rel", atts, 0);
        Attribute relational = new Attribute("attribute_name", rel_struct);

atts里有一个numeric属性和一个nominal属性,然后创建了一个大小为0的instances对象。然后利用这个instances创建了这个relation数据属性。

        Attribute num1 = new Attribute("num1");
        Attribute num2 = new Attribute("num2");

        ArrayList<String> labels = new ArrayList<String>();
        labels.add("no");
        labels.add("yes");
        Attribute cls = new Attribute("class", labels);

        ArrayList<Attribute> attributes = new ArrayList<>();
        attributes.add(num1);
        attributes.add(num2);
        attributes.add(cls);

        Instances dataset = new Instances("relation_name", attributes, 0);

我们创建了num1,num2,cls三个属性,然后创建了这个数据集的instances对象,

clipboard.png

1.2 添加数据

package weka.api;

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class Test {
    /**
     * Builds a dataset in memory: defines numeric, date, nominal and
     * string attributes, then appends one row of values.
     */
    public static void main(String[] args) throws Exception {

        // Numeric and date attributes.
        Attribute numeric = new Attribute("numeric");
        Attribute date = new Attribute("date", "yyyy-MM-dd");

        // Nominal attribute with a fixed label set.
        ArrayList<String> label = new ArrayList<String>();
        label.add("label_a");
        label.add("label_b");
        label.add("label_c");
        Attribute nominal = new Attribute("nominal", label);

        // A typed null selects the String-attribute constructor; the
        // original raw (ArrayList) cast compiled with an unchecked warning.
        Attribute string = new Attribute("string", (ArrayList<String>) null);

        // ArrayList<Attribute> rel_attributes = new ArrayList<>();
        // rel_attributes.add(numeric);
        // rel_attributes.add(nominal);
        // Instances rel_struct = new Instances("rel_struct", rel_attributes,
        // 1);
        // Attribute relation = new Attribute("relation", rel_struct);

        ArrayList<Attribute> attributes = new ArrayList<>();
        attributes.add(numeric);
        attributes.add(date);
        attributes.add(nominal);
        attributes.add(string);
        // attributes.add(relation);
        Instances data = new Instances("data", attributes, 1);

        // Every attribute value is stored internally as a double code.
        double[] values = new double[data.numAttributes()];
        values[0] = 1.23;
        values[1] = data.attribute(1).parseDate("2017-8-19");
        // Beware: indexOfValue silently returns -1 (no exception) for a
        // label that was never declared on the nominal attribute.
        values[2] = data.attribute(2).indexOfValue("label_c");
        System.out.println(values[2]);
        values[3] = data.attribute(3).addStringValue("A string");
        // Instances dataRel=new Instances(data.attribute(4).relation(), 0);
        // double[] valuesRel=new double[dataRel.numAttributes()];
        // valuesRel[0]=2.34;
        // valuesRel[1]=dataRel.attribute(1).indexOfValue("label_c");
        // dataRel.add(new DenseInstance(1.0,valuesRel));
        // values[4]=data.attribute(4).addRelation(dataRel);

        // Wrap the value array in an instance (weight 1) and append it.
        Instance inst = new DenseInstance(1, values);
        data.add(inst);

        System.out.println(data);

    }
}

relation这个东西我还不太会用。。。所以注释掉了
需要注意的是在使用nominal属性的时候,如果添加的值不在之前的声明之中,他会返回-1,却不会报错,而在使用的时候才会报错,而且还找不到哪里错误,从这点来看他们这个API写的实在有点= =粗糙。。。。

package weka.api;

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class Test {
    // Full in-memory dataset example covering all five attribute types:
    // numeric, nominal, string, date, and relational.
    public static void main(String[] args) throws Exception {
        ArrayList<Attribute> atts;
        ArrayList<Attribute> attsRel;
        ArrayList<String> attVals;
        ArrayList<String> attValsRel;

        Instances data;
        Instances dataRel;

        double[] vals;
        double[] valsRel;
        int i = 0;

        // att1: numeric.
        atts = new ArrayList<Attribute>();
        atts.add(new Attribute("att1"));

        // att2: nominal with labels val1..val5.
        attVals = new ArrayList<String>();
        for (i = 0; i < 5; i++) {
            attVals.add("val" + (i + 1));
        }
        atts.add(new Attribute("att2", attVals));

        // att3: string (the typed null selects the string constructor).
        atts.add(new Attribute("att3", (ArrayList<String>) null));

        // att4: date with an explicit format.
        atts.add(new Attribute("att4", "yyyy-MM-dd"));

        // att5: relational — its "value" is itself a small dataset whose
        // structure is defined by attsRel (one numeric, one nominal).
        attsRel = new ArrayList<Attribute>();
        attsRel.add(new Attribute("att5.1"));
        attValsRel = new ArrayList<String>();
        for (i = 0; i < 5; i++) {
            attValsRel.add("val5." + (i + 1));
        }
        attsRel.add(new Attribute("att5.2", attValsRel));
        dataRel = new Instances("att5", attsRel, 0);
        atts.add(new Attribute("att5", dataRel, 0));

        
        data=new Instances("MyRelation",atts,0);
        
        // One row of values; each slot stores the internal double code.
        vals=new double[data.numAttributes()];
        vals[0]=Math.PI;
        vals[1]=attVals.indexOf("val3");
        vals[2]=data.attribute(2).addStringValue("a string");
        vals[3]=data.attribute(3).parseDate("2017-8-19");
        // Build the nested dataset that becomes the relational value.
        dataRel=new Instances(data.attribute(4).relation(),0);
        valsRel=new double[2];
        valsRel[0]=Math.PI+1;
        valsRel[1]=attValsRel.indexOf("val5.3");
        dataRel.add(new DenseInstance(1,valsRel));
        vals[4]=data.attribute(4).addRelation(dataRel);
        data.add(new DenseInstance(1,vals));
        System.out.println(data);
        
                
    }
}

这个例子比之前我的好,不过关系型属性是真的麻烦,不过理解起来就好像是,一组数据被当做一个特征。

2.打乱数据顺序

package weka.api;

import java.util.Random;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class Test {
    /** Loads iris and prints it before and after a seeded shuffle. */
    public static void main(String[] args) throws Exception {
        Instances original = DataSource.read("data\\iris.arff");
        System.out.println(original);

        // Copy first so the original ordering stays intact, then shuffle
        // the copy with a fixed seed for reproducibility.
        Instances shuffled = new Instances(original);
        shuffled.randomize(new Random(123456L));
        System.out.println(shuffled);

    }
}

这是其中一种方法,在这种方法中,推荐使用种子,另外还有可以使用filter的方法进行随机排序,后文继续介绍

五、过滤

1.简单过滤

现在要增加一个数值属性和一个标称属性,并添加随机值

package weka.api;

import java.util.Random;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Add;

public class Test {
    /**
     * Appends one numeric and one nominal attribute to the weather data
     * and fills both new columns with seeded random values.
     */
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data\\weather.numeric.arff");
        Instances result = new Instances(data);

        // First pass: append a numeric attribute at the end.
        Add addNumeric = new Add();
        addNumeric.setAttributeIndex("last");
        addNumeric.setAttributeName("NumericAttribute");
        addNumeric.setInputFormat(result);
        result = Filter.useFilter(result, addNumeric);

        // Second pass: append a nominal attribute with labels A, B, C.
        Add addNominal = new Add();
        addNominal.setAttributeIndex("last");
        addNominal.setNominalLabels("A,B,C");
        addNominal.setAttributeName("NominalAttribute");
        addNominal.setInputFormat(result);
        result = Filter.useFilter(result, addNominal);

        // Populate the two freshly added columns row by row.
        Random rng = new Random(1234);
        for (int row = 0; row < result.numInstances(); row++) {
            result.instance(row).setValue(result.numAttributes() - 2,
                    rng.nextDouble());
            result.instance(row).setValue(result.numAttributes() - 1,
                    rng.nextInt(3));
        }

        System.out.println("过滤后的数据集:");
        System.out.println(result);

    }
}

2.批量过滤

运用了Standardize,将数据集中所有数字属性标准化,零均值与单位方差

package weka.api;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Standardize;

public class Test {
    /**
     * Batch filtering: Standardize is initialized on the training set
     * only, then applied to both sets so the test data is transformed
     * with the statistics learned from training data.
     */
    public static void main(String[] args) throws Exception {
        Instances trainSet = DataSource.read("data\\segment-challenge.arff");
        Instances testSet = DataSource.read("data\\segment-test.arff");

        Standardize standardize = new Standardize();
        standardize.setInputFormat(trainSet); // initialize on train only
        Instances filteredTrain = Filter.useFilter(trainSet, standardize);
        Instances filteredTest = Filter.useFilter(testSet, standardize);

        System.out.println("new trainer");
        System.out.println(filteredTrain);
        System.out.println("new test");
        System.out.println(filteredTest);
    }
}

3.即时过滤

package weka.api;

import weka.classifiers.meta.FilteredClassifier;
import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.unsupervised.attribute.Remove;

public class Test {
    /**
     * On-the-fly filtering: FilteredClassifier removes attribute 1 before
     * training and before every prediction, so the same transformation is
     * applied consistently to train and test data.
     */
    public static void main(String[] args) throws Exception {
        Instances train = DataSource.read("data\\segment-challenge.arff");
        Instances test = DataSource.read("data\\segment-test.arff");

        train.setClassIndex(train.numAttributes() - 1);
        test.setClassIndex(test.numAttributes() - 1);
        // Fail fast when the two files do not share the same header.
        // (Fixed typo in the message: 测试机 -> 测试集.)
        if (!train.equalHeaders(test)) {
            throw new Exception("训练集与测试集不兼容:\n" + train.equalHeadersMsg(test));
        }

        // Drop the first attribute before the classifier ever sees the data.
        Remove rm = new Remove();
        rm.setAttributeIndices("1");
        J48 j48 = new J48();
        j48.setUnpruned(true);

        FilteredClassifier fc = new FilteredClassifier();
        fc.setFilter(rm);
        fc.setClassifier(j48);

        fc.buildClassifier(train);
        for (int i = 0; i < test.numInstances(); i++) {
            double pred = fc.classifyInstance(test.instance(i));
            System.out.print("index: " + (i + 1));
            System.out.print(", class: " + test.classAttribute()
                    .value((int) test.instance(i).classValue()));
            System.out.println(", predict class: "
                    + test.classAttribute().value((int) pred));
        }

    }
}

解释一下

  • equalHeaders
    Checks if two headers are equivalent. 属性名是否一致

六、分类

1.分类

分类器分为批量分类器和增量分类器
构建批量分类器分为两步

  • 设置选项,可以使用setOptions和set方法
  • 进行训练,调用buildClassifier方法

示例
clipboard.png

增量分类器都实现了UpdateableClassifier接口
增量分类器用于处理规模较大的数据,不会将数据一次加载进内存,arff文件可以增量读取,同样也分两步

  • 调用buildClassifier方法进行模型初始化
  • 调用updateClassifier方法进行一行一行得更新数据

示例
clipboard.png
为啥不带数据,因为之前loader进行的是加载结构的方法

2.分类器评估

构建分类器的评价标准有两种方式,交叉验证和专用测试集验证
评价由Evaluation类实现

示例

clipboard.png

clipboard.png

3.示例

3.1 批量分类器构建

package weka.api;

import java.io.File;

import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

public class Test {
    /** Builds an unpruned J48 tree on the nominal weather data. */
    public static void main(String[] args) throws Exception {
        ArffLoader arffLoader = new ArffLoader();
        arffLoader.setFile(new File("data\\weather.nominal.arff"));
        Instances dataset = arffLoader.getDataSet();
        dataset.setClassIndex(dataset.numAttributes() - 1);

        // "-U" requests an unpruned tree.
        J48 classifier = new J48();
        classifier.setOptions(new String[] { "-U" });
        classifier.buildClassifier(dataset);

        System.out.println(classifier);
    }
}

4.2 增量分类器构建

package weka.api;

import java.io.File;

import weka.classifiers.bayes.NaiveBayesUpdateable;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

public class Test {
    /**
     * Incremental training: only the header is loaded up front, then
     * rows are streamed one at a time into an updateable classifier.
     */
    public static void main(String[] args) throws Exception {
        ArffLoader arffLoader = new ArffLoader();
        arffLoader.setFile(new File("data\\weather.nominal.arff"));
        Instances header = arffLoader.getStructure();
        header.setClassIndex(header.numAttributes() - 1);

        NaiveBayesUpdateable bayes = new NaiveBayesUpdateable();
        bayes.buildClassifier(header); // initialize with the structure only
        for (Instance row = arffLoader.getNextInstance(header); row != null;
                row = arffLoader.getNextInstance(header)) {
            bayes.updateClassifier(row);
        }

        System.out.println(bayes);

    }
}

4.3 输出类别分布

package weka.api;

import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.Utils;
import weka.core.converters.ConverterUtils.DataSource;

public class Test {
    // Trains J48 on the training set, then for each test instance prints
    // the predicted class, whether it matched the actual class, and the
    // full class-probability distribution.
    public static void main(String[] args) throws Exception {
        Instances train = DataSource.read("data\\segment-challenge.arff");
        Instances test = DataSource.read("data\\segment-test.arff");
        train.setClassIndex(train.numAttributes() - 1);
        test.setClassIndex(test.numAttributes() - 1);

        // Both sets must share the same attribute structure.
        if (!train.equalHeaders(test)) {
            throw new Exception("不相容");
        }
        J48 classifier = new J48();
        classifier.buildClassifier(train);

        for (int i = 0; i < test.numInstances(); i++) {
            // Predicted class index for this instance.
            double pred = classifier.classifyInstance(test.instance(i));
            // Per-class probability distribution for the same instance.
            double[] dist = classifier
                    .distributionForInstance(test.instance(i));
            System.out.print((i + 1) + " - "
                    + test.instance(i).toString(test.classIndex()) + " - "
                    + test.classAttribute().value((int) pred) + " - ");
            if (pred != test.instance(i).classValue()) {
                System.out.print("wrong");
            } else {
                System.out.print("correct");
            }
            System.out.println(" - " + Utils.arrayToString(dist));

        }

    }
}

训练了一个分类器,然后一条一条地遍历测试集,
double pred = classifier.classifyInstance(test.instance(i));是预测结果
double[] dist = classifier.distributionForInstance(test.instance(i));得到的是这条数据的预测各个类的概率

4.5 交叉验证并预测

package weka.api;

import java.util.Random;

import weka.classifiers.AbstractClassifier;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.core.Instances;
import weka.core.Utils;
import weka.core.converters.ConverterUtils.DataSource;

public class Test {
    /**
     * Manual 10-fold cross-validation of J48 on the ionosphere data,
     * aggregating all fold results in one Evaluation object.
     */
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data\\ionosphere.arff");
        data.setClassIndex(data.numAttributes() - 1);

        // Instantiate the classifier from its class name plus options.
        String[] options = new String[2];
        String classname = "weka.classifiers.trees.J48";
        options[0] = "-C";
        options[1] = "0.25";
        Classifier classifier = (Classifier) Utils.forName(Classifier.class,
                classname, options);

        int seed = 1234;
        int folds = 10;

        // Shuffle a copy; stratifying only makes sense for a nominal class.
        Random rand = new Random(seed);
        Instances newData = new Instances(data);
        newData.randomize(rand);
        if (newData.classAttribute().isNominal()) {
            newData.stratify(folds);
        }

        Evaluation eval = new Evaluation(newData);
        for (int i = 0; i < folds; i++) {
            Instances train = newData.trainCV(folds, i);
            Instances test = newData.testCV(folds, i);
            // Train a fresh, untrained copy for every fold; eval
            // accumulates the per-fold results.
            Classifier clsCopy = AbstractClassifier.makeCopy(classifier);
            clsCopy.buildClassifier(train);
            eval.evaluateModel(clsCopy, test);
        }
        System.out.println("===分类器设置===");
        System.out.println("分类器:" + Utils.toCommandLine(classifier));
        System.out.println("数据集:" + data.relationName());
        System.out.println("折数:" + folds);
        System.out.println("随机种子:" + seed);
        System.out.println();
        // Fixed typo in the summary heading: 交叉认证 -> 交叉验证.
        System.out.println(
                eval.toSummaryString("=== " + folds + "折交叉验证===", false));
    }
}

其实不难理解,不过有几个地方需要说
newData.randomize(rand);这个是将数据随机打乱
newData.stratify(folds);这个的api是这么写的

Stratifies a set of instances according to its class values if the class attribute is nominal (so that afterwards a stratified cross-validation can be performed).

意思应该是,如果这个类信息是标称的,那么我们之后如果用的是n折的,比如99个个体共3类,每类都33个,那假如分3折,那前33个里应该每类大约11个左右这样。

4.6 交叉验证并保存预测结果

package weka.api;

import java.util.Random;

import weka.classifiers.AbstractClassifier;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.core.Instances;
import weka.core.OptionHandler;
import weka.core.Utils;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.supervised.attribute.AddClassification;

public class Test {
    /**
     * 10-fold cross-validation that, in addition to evaluating, collects
     * a copy of every test fold with the predicted class, the class
     * distribution and an error flag appended via AddClassification.
     */
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data\\ionosphere.arff");
        data.setClassIndex(data.numAttributes() - 1);
        String[] options = new String[2];
        String classname = "weka.classifiers.trees.J48";
        options[0] = "-C";
        options[1] = "0.25";
        Classifier classifier = (Classifier) Utils.forName(Classifier.class,
                classname, options);

        int seed = 1234;
        int folds = 10;

        Random rand = new Random(seed);
        Instances newData = new Instances(data);
        newData.randomize(rand);
        if (newData.classAttribute().isNominal()) {
            newData.stratify(folds);
        }

        Instances predictedData = null;
        Evaluation eval = new Evaluation(newData);
        for (int i = 0; i < folds; i++) {
            Instances train = newData.trainCV(folds, i);
            Instances test = newData.testCV(folds, i);
            // A fresh, untrained copy is built for every fold; eval
            // accumulates results across all folds.
            Classifier clsCopy = AbstractClassifier.makeCopy(classifier);
            clsCopy.buildClassifier(train);
            eval.evaluateModel(clsCopy, test);

            // Wrap an (untrained) classifier in a filter that appends the
            // prediction, distribution and an error flag as new attributes.
            AddClassification filter = new AddClassification();
            filter.setClassifier(classifier);
            filter.setOutputClassification(true);
            filter.setOutputDistribution(true);
            filter.setOutputErrorFlag(true);

            filter.setInputFormat(train);
            // The first useFilter call trains the filter's classifier on
            // the train fold; the second applies that trained model to the
            // test fold and emits the predictions.
            Filter.useFilter(train, filter);
            Instances pred = Filter.useFilter(test, filter);

            if (predictedData == null)
                predictedData = new Instances(pred, 0);
            for (int j = 0; j < pred.numInstances(); j++)
                predictedData.add(pred.instance(j));
        }
        System.out.println("===分类器设置===");
        if (classifier instanceof OptionHandler)
            System.out.println("分类器: " + classifier.getClass().getName() + " "
                    + Utils.joinOptions(
                            ((OptionHandler) classifier).getOptions()));
        else
            System.out.println("分类器:" + Utils.toCommandLine(classifier));
        System.out.println("数据集:" + data.relationName());
        System.out.println("折数:" + folds);
        System.out.println("随机种子:" + seed);
        System.out.println();
        // Fixed typo in the summary heading: 交叉认证 -> 交叉验证.
        System.out.println(
                eval.toSummaryString("=== " + folds + "折交叉验证===", false));
    }
}

这个得好好掰扯掰扯

Classifier clsCopy = AbstractClassifier.makeCopy(classifier);
            clsCopy.buildClassifier(train);Classifier clsCopy = AbstractClassifier.makeCopy(classifier);
            clsCopy.buildClassifier(train);

创建了一空的原始的啥都不知道的分类器,然后再训练集进行了训练

eval.evaluateModel(clsCopy, test);

这是将这个训练好的分类器,运用到测试集上进行测试,这是个累加的过程,可以看到比如第一折测试的时候,测试集有35个,那么这个eval记录了这35个的测试结果,第二折测试集有31个,那么这个eval记录了35+31总共的分类结果。

AddClassification filter = new AddClassification();
filter.setClassifier(classifier);
filter.setOutputClassification(true);
filter.setOutputDistribution(true);
filter.setOutputErrorFlag(true);

doc上写

用于将分类,类分布和错误标记添加到具有分类器的数据集的过滤器。 分类器是对数据本身进行培训或作为序列化模型提供。

其实应该类似于把这个空的Classifier包装了起来,包装成一个过滤器

filter.setInputFormat(train);
            Filter.useFilter(train, filter);
            Instances pred = Filter.useFilter(test, filter);

先用setInputFormat设置数据格式。两条useFilter语句形式虽然相同,作用却不同:第一次Filter.useFilter(train, filter)会让过滤器内部的分类器在训练集上完成训练;第二次对测试集调用时,才利用这个已训练好的模型产生预测,并在每条数据后面追加分类结果、类分布和错误标记三个属性。

之后就是把预测结果丢进去就可以了。
end


z_dominic
115 声望15 粉丝

你有freestyle吗?


引用和评论

0 条评论