代码改变世界

Weka——PrincipalComponents分析

2013-11-11 13:48  Loull  阅读(885)  评论(0)  编辑  收藏  举报

package weka.filters.unsupervised.attribute;

PrincipalComponents

属性:

  /** The data to analyse/transform; set in setup() and cleared to null once setup() finishes. */
  protected Instances m_TrainInstances;

  /**
   * Kept so the class attribute can be appended to the transformed data
   * (if a class is set). NOTE(review): setup() builds this with
   * {@code new Instances(m_TrainInstances, 0)}; presumably that keeps only
   * the header and no rows -- confirm against the Instances API.
   */
  protected Instances m_TrainCopy;

  /** The header for the transformed data format. */
  protected Instances m_TransformedFormat;

  /** Data has a class set. */
  protected boolean m_HasClass;

  /** Class index. */
  protected int m_ClassIndex;

  /** Number of attributes (after preprocessing in setup()). */
  protected int m_NumAttribs;

  /** Number of instances. */
  protected int m_NumInstances;

  /**
   * Matrix the eigen-decomposition is run on: the correlation matrix when
   * m_center is false, the covariance matrix when m_center is true.
   */
  protected double[][] m_Correlation;
  
  /** 
   * If true, center (rather than standardize) the data and
   * compute PCA from covariance (rather than correlation)
   * matrix.
   */
  private boolean m_center = false;

  /** Will hold the unordered linear transformations of the (normalized)
      original data; indexed as [attribute][eigenvector]. */
  protected double[][] m_Eigenvectors;

  /** Eigenvalues for the corresponding eigenvectors (negative values are clamped to 0 in setup()). */
  protected double[] m_Eigenvalues = null;

  /**
   * Indices into m_Eigenvalues in sorted order, as returned by Utils.sort.
   * convertInstance() walks this array from the last position downwards.
   */
  protected int[] m_SortedEigens;

  /** Sum of the eigenvalues. */
  protected double m_SumOfEigenValues = 0.0;

  /** Filter for replacing missing values. */
  protected ReplaceMissingValues m_ReplaceMissingFilter;
  
  /** Filter for turning nominal values into numeric ones. */
  protected NominalToBinary m_NominalToBinaryFilter;
  
  /**
   * Filter for removing the class attribute and any attribute with at most
   * one distinct value; stays null when setup() finds nothing to remove.
   */
  protected Remove m_AttributeFilter;
  
  /** Filter for standardizing the data (used when m_center is false). */
  protected Standardize m_standardizeFilter;
  
  /** Filter for centering the data (used when m_center is true). */
  protected Center m_centerFilter;

  /** The number of attributes in the pc transformed data. */
  protected int m_OutputNumAtts = -1;  

  /** The amount of variance to cover in the original data when
      retaining the best n PC's. */
  protected double m_CoverVariance = 0.95;

  /** Maximum number of attributes in the transformed attribute name. */
  protected int m_MaxAttrsInName = 5;

  /** Maximum number of attributes in the transformed data (-1 for all). */
  protected int m_MaxAttributes = -1;

计算协方差矩阵或相关系数矩阵

  /**
   * Fills {@code m_Correlation} with the matrix the eigen-decomposition is
   * computed from.
   * <p>
   * When {@code m_center} is false this simply delegates to
   * {@link #fillCorrelation()}. Otherwise the training data is passed through
   * a fresh {@code Center} filter (replacing {@code m_TrainInstances}) and the
   * covariance matrix of the centered data, normalised by
   * {@code numInstances - 1}, is stored.
   *
   * @throws Exception propagated from the filter calls or from
   *                   {@link #fillCorrelation()}
   */
  protected void fillCovariance() throws Exception {
    if (!m_center) {
      fillCorrelation();
      return;
    }

    // center the data by subtracting the mean
    m_centerFilter = new Center();
    m_centerFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_centerFilter);

    // now compute the covariance matrix
    m_Correlation = new double[m_NumAttribs][m_NumAttribs];
    final double divisor = (double) (m_TrainInstances.numInstances() - 1);

    for (int i = 0; i < m_NumAttribs; i++) {
      // The matrix is symmetric, so only the lower triangle (incl. diagonal)
      // is computed and each value is mirrored into the upper triangle.
      for (int j = 0; j <= i; j++) {
        double cov = 0;
        for (int k = 0; k < m_NumInstances; k++) {
          cov += (m_TrainInstances.instance(k).value(i) *
              m_TrainInstances.instance(k).value(j));
        }

        cov /= divisor;
        m_Correlation[i][j] = cov;
        m_Correlation[j][i] = cov;
      }
    }
  }

  /**
   * Fills {@code m_Correlation} with the pairwise correlations between the
   * attributes of {@code m_TrainInstances} (diagonal fixed at 1.0) and then
   * replaces {@code m_TrainInstances} with a standardized copy produced by a
   * fresh {@code Standardize} filter.
   *
   * @throws Exception propagated from the standardize filter calls
   */
  protected void fillCorrelation() throws Exception {
    m_Correlation = new double[m_NumAttribs][m_NumAttribs];
    double[] colI = new double[m_NumInstances];
    double[] colJ = new double[m_NumInstances];

    for (int i = 0; i < m_NumAttribs; i++) {
      m_Correlation[i][i] = 1.0;

      // Column i is the same for every j, so copy it once per outer pass.
      for (int k = 0; k < m_NumInstances; k++) {
        colI[k] = m_TrainInstances.instance(k).value(i);
      }

      // The matrix is symmetric: compute each pair once (j < i) and mirror it.
      // The (column i, column j) argument order matches the call that
      // produced the value finally stored for this pair before this change.
      for (int j = 0; j < i; j++) {
        for (int k = 0; k < m_NumInstances; k++) {
          colJ[k] = m_TrainInstances.instance(k).value(j);
        }
        double corr = Utils.correlation(colI, colJ, m_NumInstances);
        m_Correlation[i][j] = corr;
        m_Correlation[j][i] = corr;
      }
    }

    // now standardize the input data
    m_standardizeFilter = new Standardize();
    m_standardizeFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_standardizeFilter);
  }

处理数据

  /**
   * Projects one instance, given in the original (unnormalized) attribute
   * space, onto the retained principal components.
   * <p>
   * A copy of the instance is pushed through the filters prepared during
   * setup (missing-value replacement, nominal-to-binary, optional attribute
   * removal, then centering or standardizing depending on {@code m_center}).
   * Components are then emitted starting from the last position of
   * {@code m_SortedEigens} and stop once the accumulated eigenvalue share
   * reaches {@code m_CoverVariance} or the {@code m_MaxAttributes} limit is
   * hit. If a class is set, its value is copied into the last output slot.
   *
   * @param instance     an instance in the original (unnormalized) format
   * @return             the transformed instance (sparse if the input was sparse)
   * @throws Exception   if instance can't be transformed
   */
  protected Instance convertInstance(Instance instance) throws Exception {
    final double[] projected = new double[m_OutputNumAtts];
    Instance current = (Instance) instance.copy();

    // replay the preprocessing chain on the copy
    m_ReplaceMissingFilter.input(current);
    m_ReplaceMissingFilter.batchFinished();
    current = m_ReplaceMissingFilter.output();

    m_NominalToBinaryFilter.input(current);
    m_NominalToBinaryFilter.batchFinished();
    current = m_NominalToBinaryFilter.output();

    if (m_AttributeFilter != null) {
      m_AttributeFilter.input(current);
      m_AttributeFilter.batchFinished();
      current = m_AttributeFilter.output();
    }

    if (m_center) {
      m_centerFilter.input(current);
      m_centerFilter.batchFinished();
      current = m_centerFilter.output();
    } else {
      m_standardizeFilter.input(current);
      m_standardizeFilter.batchFinished();
      current = m_standardizeFilter.output();
    }

    // the class value is carried over untouched from the original instance
    if (m_HasClass) {
      projected[m_OutputNumAtts - 1] = instance.value(instance.classIndex());
    }

    // lowest position in m_SortedEigens that may still be emitted
    final int lowestPos = (m_MaxAttributes > 0)
        ? Math.max(0, m_NumAttribs - m_MaxAttributes)
        : 0;

    double coveredSoFar = 0;
    for (int slot = 0, pos = m_NumAttribs - 1; pos >= lowestPos; slot++, pos--) {
      // dot product of the preprocessed instance with this eigenvector
      double dot = 0.0;
      for (int a = 0; a < m_NumAttribs; a++) {
        dot += m_Eigenvectors[a][m_SortedEigens[pos]] * current.value(a);
      }
      projected[slot] = dot;

      coveredSoFar += m_Eigenvalues[m_SortedEigens[pos]];
      if ((coveredSoFar / m_SumOfEigenValues) >= m_CoverVariance) {
        break;
      }
    }

    // keep the sparse/dense representation of the input
    if (instance instanceof SparseInstance) {
      return new SparseInstance(instance.weight(), projected);
    }
    return new DenseInstance(instance.weight(), projected);
  }

  /**
   * Initializes the filter with the given input data.
   * <p>
   * Steps, in order: copy the data; replace missing values; binarize nominal
   * attributes; remove the class attribute and every attribute with at most
   * one distinct value; check capabilities; build the correlation/covariance
   * matrix via {@link #fillCovariance()}; run the eigen-decomposition; clamp
   * negative eigenvalues to 0; sort them; and finally derive and publish the
   * output format. {@code m_TrainInstances} is released at the end.
   *
   * @param instances   the data to process
   * @throws Exception  in case the processing goes wrong
   * @see               #batchFinished()
   */
  protected void setup(Instances instances) throws Exception {
    m_TrainInstances = new Instances(instances);

    // keep a second Instances object so that we can get the class
    // column to append to the transformed data (if necessary)
    // NOTE(review): constructed with capacity 0 -- presumably header only; confirm.
    m_TrainCopy = new Instances(m_TrainInstances, 0);

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    m_ReplaceMissingFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_ReplaceMissingFilter);

    m_NominalToBinaryFilter = new NominalToBinary();
    m_NominalToBinaryFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_NominalToBinaryFilter);

    // delete any attributes with only one distinct value or are all missing
    Vector<Integer> deleteCols = new Vector<Integer>();
    for (int i = 0; i < m_TrainInstances.numAttributes(); i++) {
      if (m_TrainInstances.numDistinctValues(i) <= 1) {
        deleteCols.addElement(i);
      }
    }

    if (m_TrainInstances.classIndex() >= 0) {
      // get rid of the class column
      m_HasClass = true;
      m_ClassIndex = m_TrainInstances.classIndex();
      deleteCols.addElement(m_ClassIndex);
    }

    // remove columns from the data if necessary
    if (deleteCols.size() > 0) {
      m_AttributeFilter = new Remove();
      int[] todelete = new int[deleteCols.size()];
      for (int i = 0; i < deleteCols.size(); i++) {
        todelete[i] = deleteCols.elementAt(i);
      }
      m_AttributeFilter.setAttributeIndicesArray(todelete);
      m_AttributeFilter.setInvertSelection(false);
      m_AttributeFilter.setInputFormat(m_TrainInstances);
      m_TrainInstances = Filter.useFilter(m_TrainInstances, m_AttributeFilter);
    }

    // can evaluator handle the processed data ? e.g., enough attributes?
    getCapabilities().testWithFail(m_TrainInstances);

    m_NumInstances = m_TrainInstances.numInstances();
    m_NumAttribs   = m_TrainInstances.numAttributes();

    // fills m_Correlation (correlation or covariance, depending on m_center)
    fillCovariance();

    // get eigen vectors/values
    Matrix corr = new Matrix(m_Correlation);
    EigenvalueDecomposition eig = corr.eig();
    Matrix V = eig.getV();
    double[][] v = new double[m_NumAttribs][m_NumAttribs];
    for (int i = 0; i < v.length; i++) {
      for (int j = 0; j < v[0].length; j++) {
        v[i][j] = V.get(i, j);
      }
    }
    // v is a freshly built local array, so it can be stored directly
    m_Eigenvectors = v;
    // copy so that later clamping does not write into the decomposition's array
    m_Eigenvalues  = eig.getRealEigenvalues().clone();

    // any eigenvalues less than 0 are not worth anything --- change to 0
    for (int i = 0; i < m_Eigenvalues.length; i++) {
      if (m_Eigenvalues[i] < 0) {
        m_Eigenvalues[i] = 0.0;
      }
    }
    m_SortedEigens     = Utils.sort(m_Eigenvalues);
    m_SumOfEigenValues = Utils.sum(m_Eigenvalues);

    m_TransformedFormat = determineOutputFormat(m_TrainInstances);
    setOutputFormat(m_TransformedFormat);

    // the processed training data is no longer needed
    m_TrainInstances = null;
  }