我编写了一个Java程序,用于分析.soft基因表达式数据文件并将其写入txt
package il.ac.tau.cs.sw1.bioinformatics;
import org.apache.commons.math3.stat.inference.TestUtils;
import java.io.*;
import java.util.Arrays;
/**
*
* Gene Expression Analyzer
*
* Command line arguments:
* args[0] - GeoDatasetName: Gene expression dataset name (expects a corresponding
input file in SOFT format to exist in the local directory).
* args[1] - Label1: Label of the first sample subset
* args[2] - Label2: Label of the second sample subset
* args[3] - Alpha: T-test confidence level : only genes with pValue below this
threshold will be printed to output file
*
* Execution example: GeneExpressionAnalyzer GDS4085 "estrogen receptor-negative" "estrogen receptor-positive" 0.01
*
* @author software1-2014
*
*/
public class GeneExpressionAnalyzer {
public static void main(String args[]) throws IOException {
// Reads the dataset from a SOFT input file
String inputSoftFileName = args[0] + ".soft";
GeneExpressionDataset geneExpressionDataset = parseGeneExpressionFile (inputSoftFileName);
System.out.printf ("Gene expression dataset loaded from file %s. %n",inputSoftFileName);
System.out.printf("Dataset contains %d samples and %d gene probes.%n%n",geneExpressionDataset.samplesNumber, geneExpressionDataset.genesNumber);
// Writes the dataset to a tabular format
String tabularFileName = args[0] + "-Tabular.txt";
writeDatasetToTabularFile(geneExpressionDataset,tabularFileName);
System.out.printf ("Dataset saved to tabular file - %s.%n%n",tabularFileName);
// Identifies differentially expressed genes between two sample groups and writes the results to a text file
String label1 = args[1];
String label2 = args[2];
double alpha = Double.parseDouble(args[3]);
String diffGenesFileName = args[0] + "-DiffGenes.txt";
int numOfDiffGenes = writeTopDifferentiallyExpressedGenesToFile(diffGenesFileName,geneExpressionDataset, alpha, label1, label2);
System.out.printf ("%d differentially expressed genes identified using alpha of %f when comparing the two sample groups [%s] and [%s].%n",numOfDiffGenes, alpha, label1, label2);
System.out.printf ("Results saved to file %s.%n",diffGenesFileName);
}
private static float[] StringtoFloat(String[] temp) {
float[] array = new float[temp.length];
for (int i = 0; i < temp.length; i++){
array[i]= Float.parseFloat(temp[i]);
}
return array;
}
private static double[] CutToCounter(double[] array, int counter) {
if (array.length == counter){
return array;
}
double[] args = new double[counter+1];
for (int i = 0; i < args.length; i++){
args[i] = array[i];
}
return args;
}
private static int min(double[] pValues) {
double val = 2;
int index = -1;
for (int i = 0; i < pValues.length; i++){
if (pValues[i] < val && pValues[i] != 3.0){
val = pValues[i];
index = i;
}
}
return index;
}
private static String changeformat(float[] array) {
String[] args = new String[array.length];
for (int i = 0; i < array.length; i++){
args[i] = String.format("%.2f", array[i]);
}
return Arrays.toString(args);
}
/**
*
* parseGeneExpressionFile - parses the given SOFT file
*
*
* @param filename A gene expression file in SOFT format
* @return a GeneExpressionDataset object storing all data parsed from the input file
* @throws IOException
*/
public static GeneExpressionDataset parseGeneExpressionFile (String filename) throws IOException {
GeneExpressionDataset dataset = new GeneExpressionDataset();
BufferedReader buf = new BufferedReader(new FileReader(filename));
String line = buf.readLine();
String[] geneids = null;
String[] genesymbols = null;
float[][] datamatrix = null;
String[][] subsetinfo = new String[10][2];
String[][] subsetsample = new String[10][];
int i = 0;
int j = 0;
boolean bol = false;
while (line != null){
if (line.startsWith("!dataset_sample_count")){
dataset.samplesNumber = Integer.parseInt(line.substring(24));
}
else if (line.startsWith("!dataset_sample_count")){
dataset.genesNumber = Integer.parseInt(line.substring(25));
geneids = new String[dataset.genesNumber];
genesymbols = new String[dataset.genesNumber];
}
else if (line.startsWith("^SUBSET")){
subsetinfo[i][0] = line.substring(10);
i++;
}
else if (line.startsWith("!subset_sample_description")){
subsetinfo[i][1] = line.substring(22);
}
else if (line.startsWith("!subset_sample_id")){
subsetsample[i-1] = line.substring(20).split(",");
}
else if (line.startsWith("!dataset_table_begin")){
datamatrix = new float[dataset.genesNumber][dataset.samplesNumber];
}
else if (line.startsWith("ID_REF")){
String[] array1 = line.split("\t");
dataset.sampleIds = (String[]) Arrays.copyOfRange(array1, 2, array1.length);
bol = true;
}
else if (bol && !line.startsWith("!dataset_table_end")){
String[] array2 = line.split("\t");
geneids[j] = array2[0];
genesymbols[j] = array2[1];
String[] temp = (String[]) Arrays.copyOfRange(array2, 2, array2.length);
datamatrix[j] = StringtoFloat(temp);
j++;
}
}
buf.close();
dataset.geneIds = geneids;
dataset.geneSymbols = genesymbols;
dataset.dataMatrix = datamatrix;
String[] lables = new String[dataset.samplesNumber];
int k = 0;
for (String sample : dataset.sampleIds) {
for (int m = 0; m < subsetsample.length; m++) {
if (Arrays.binarySearch(subsetsample[m], sample) != -1) {
lables[k] = subsetsample[m][1];
k += 1;
} else {
continue;
}
}
}
dataset.labels = lables;
return dataset;
}
/**
* writeDatasetToTabularFile
* writes the dataset to a tabular text file
*
* @param geneExpressionDataset
* @param outputFilename
* @throws IOException
*/
public static void writeDatasetToTabularFile(GeneExpressionDataset geneExpressionDataset, String outputFilename) throws IOException {
File NewFile = new File(outputFilename);
BufferedWriter buf = new BufferedWriter(new FileWriter(NewFile));
String Lables = "\t" + "\t" + "\t" + "\t" + Arrays.toString(geneExpressionDataset.labels);
String Samples = "\t" + "\t" + "\t" + "\t" + Arrays.toString(geneExpressionDataset.sampleIds);
buf.write(Lables + "\r\n" + Samples + "\r\n");
for (int i = 0; i < geneExpressionDataset.genesNumber; i++){
buf.write(geneExpressionDataset.geneIds[i] + "\t"+ geneExpressionDataset.geneSymbols[i] + "\t" +
changeformat(geneExpressionDataset.dataMatrix[i]) + "\r\n");
}
buf.close();
}
/**
*
* writeTopDifferentiallyExpressedGenesToFile
*
* @param outputFilename
* @param geneExpressionDataset
* @param alpha
* @param label1
* @param label2
* @return numOfDiffGenes The number of differentially expressed genes detected, having p-value lower than alpha
* @throws IOException
*/
public static int writeTopDifferentiallyExpressedGenesToFile(String outputFilename,
GeneExpressionDataset geneExpressionDataset, double alpha,
String label1, String label2) throws IOException {
double pValues[] = new double[geneExpressionDataset.genesNumber];
int counter = 0;
for (int i = 0; i < pValues.length; i++){
double pval = calcTtest(geneExpressionDataset, i, label1, label2);
if (pval < alpha){
pValues[i] = pval;
counter++;
}
else{
continue;
}
}
File tofile = new File(outputFilename);
BufferedWriter buf = new BufferedWriter(new FileWriter(tofile));
int j = 0;
while (min(pValues) != -1){
String PVal = String.format("%.6f", pValues[min(pValues)]);
String gene_id = geneExpressionDataset.geneIds[min(pValues)];
String gene_symbol = geneExpressionDataset.geneSymbols[min(pValues)];
String line = String.valueOf(j) + "\t" + PVal + "\t" + gene_id + "\t" + gene_symbol;
buf.write(line + "\r\n");
pValues[min(pValues)] = 3.0;
j++;
}
buf.close();
return counter;
}
/**
*
* getDataEntriesForLabel
*
* Returns the entries in the 'data' array for which the corresponding entries in the 'labels' array equals 'label'
*
* @param data
* @param labels
* @param label
* @return
*/
public static double[] getDataEntriesForLabel(float[] data, String[] labels, String label) {
double[] array = new double[data.length];
int counter = 0;
for (int i = 0; i < data.length; i++){
if (labels[i].equals(label)){
array[counter] = data[i];
counter++;
}
else{
continue;
}
}return CutToCounter(array, counter);
}
/**
* calcTtest - returns a pValue for the t-Test
*
* Returns the p-value, associated with a two-sample, two-tailed t-test comparing the means of the input arrays
*
* //http://commons.apache.org/proper/commons-math/apidocs/org/apache/commons/math3/stat/inference/TTest.html#tTest(double[], double[])
*
* @param geneExpressionDataset
* @param geneIndex
* @param label1
* @param label2
* @return
*/
private static double calcTtest(GeneExpressionDataset geneExpressionDataset, int geneIndex, String label1, String label2) {
double[] sample1 = getDataEntriesForLabel(geneExpressionDataset.dataMatrix[geneIndex], geneExpressionDataset.labels, label1);
double[] sample2 = getDataEntriesForLabel(geneExpressionDataset.dataMatrix[geneIndex], geneExpressionDataset.labels, label2);
return TestUtils.tTest(sample1, sample2);
}
/**
*
* GeneExpressionDataset
* A class representing a gene expression dataset
*
* @author software1-2014
*
*/
public static class GeneExpressionDataset {
public int samplesNumber; //number of dataset samples
public int genesNumber; // number of dataset gene probes
public String[] sampleIds; //sample ids
public String[] geneIds; //gene probe ids
public String[] geneSymbols; //gene symbols
public float[][] dataMatrix; //expression data matrix
public String[] labels; //sample labels
}
}
现在,它不会编译,错误信息是这样的: &#34; GeneExpressionAnalyzer.java:2:错误:包org.apache.commons.math3.stat.inference不存在
import org.apach.commons.math3.stat.interference.TestUtils;
GeneExpressionAnalyzer.java:277:错误:找不到符号 return TestUtils.tTest; 符号:变量TestUtils location:类GeneExpressionAnalyzer 2个错误&#34;
我不知道出了什么问题,显然我已经添加了包含TestUtils路径的.jar文件。 (这里是:http://apache.spd.co.il//commons/math/binaries/commons-math3-3.2-bin.zip)
任何见解?
答案 0 :(得分:1)
如果您正在使用eclipse,
从here
手动下载jar文件在eclipse之后打开package explorer
- &gt;右键单击您的项目
Build Path
- &gt; Configure Build Path
,将打开一个窗口。
在Libraries
标签下 - &gt;点击Add External JARs
。选择已下载的jar文件单击“确定”。
这就是全部。现在问题可能已经消失
答案 1 :(得分:0)
它是否可以在命令行中运行?
我已将课程缩短为
import org.apache.commons.math3.stat.inference.TestUtils;
import java.io.*;
import java.util.Arrays;
public class Test {
public static void main(String args[]) throws IOException {
System.out.printf ("test...");
}
}
我将Test.java文件和commons-math3-3.2.jar复制到同一目录,这是命令行的输出:
C:\temp\test>dir
Répertoire de C:\temp\test
24/04/2014 14:41 <REP> .
24/04/2014 14:41 <REP> ..
24/04/2014 14:38 1 692 782 commons-math3-3.2.jar
24/04/2014 14:41 230 Test.java
2 fichier(s) 1 693 012 octets
2 Rép(s) 23 170 342 912 octets libres
C:\temp\test>javac Test.java
Test.java:1: package org.apache.commons.math3.stat.inference does not exist
import org.apache.commons.math3.stat.inference.TestUtils;
^
1 error
C:\temp\test>javac -cp commons-math3-3.2.jar Test.java
C:\temp\test>dir
Répertoire de C:\temp\test
24/04/2014 14:41 <REP> .
24/04/2014 14:41 <REP> ..
24/04/2014 14:38 1 692 782 commons-math3-3.2.jar
24/04/2014 14:41 500 Test.class
24/04/2014 14:41 230 Test.java