new_host  split     sequence    expression
FALSE     train     AQVPYGVS    0.039267878
FALSE     train     ASVPYGVSI   0.039267878
FALSE     train     STNLYGSGR   0.261456561
FALSE     valid     NLYGSGLVR   0.265188519
FALSE     valid     SLGPSNLYG   0.419680588
FALSE     valid     ATSLGTTNG   0.145710993

我正在尝试为我的模型(PLS 回归模型)计算共形预测(conformal prediction),即基于校准数据上的不一致性函数(nonconformity function)来计算预测区间(预测目标是我的序列的 expression 值)。我的算法步骤如下:

  1. 划分我的数据集
  2. 用训练数据拟合我的模型
  3. 将不一致性函数(nonconformity function)定义为预测标签与真实标签之间的绝对误差
  4. 将不一致性函数应用于我的校准数据集
  5. 根据第 4 步的结果计算不一致性得分(nonconformity scores)

现在,我需要根据显著性水平确定预测区间。在为数据集计算区间时我遇到了问题:我不断遇到各种 numpy 错误,不确定该如何处理。问题出在我最后一个名为 conformal_predictions 的类方法中,也就是我计算预测区间的地方。


def data_split(df, cal_size=0.2, random_state=None):
    """Split a labelled frame into test, proper-training and calibration sets.

    Rows are routed by the ``split`` column: ``'train'`` rows form the
    training pool and ``'valid'`` rows form the test set.  The training pool
    is then divided again so that a held-out calibration set is available for
    conformal prediction.

    The last column of ``df`` is taken as the target and *every* other column
    is returned as a feature column (for the sample table that includes
    ``new_host``, ``split`` and the raw ``sequence`` strings).
    NOTE(review): presumably those columns are encoded or dropped before the
    model is fitted -- confirm against the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``split`` column; the target must be the last column.
    cal_size : float or int, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``; the share of the
        training pool reserved for calibration.
    random_state : int or None, default None
        Forwarded to ``train_test_split``; pass an int for a reproducible
        calibration split.

    Returns
    -------
    tuple
        ``(X_test, y_test, X_train, y_train, X_cal, y_cal)``.
    """
    train = df.loc[df['split'] == 'train']
    valid = df.loc[df['split'] == 'valid']

    X_test = valid.iloc[:, :-1]
    y_test = valid.iloc[:, -1]
    X_train = train.iloc[:, :-1]
    y_train = train.iloc[:, -1]

    # Carve the calibration set out of the training pool only, so the test
    # rows never influence the calibration scores.
    X_train, X_cal, y_train, y_cal = train_test_split(
        X_train, y_train, test_size=cal_size, random_state=random_state)

    print("Data has been split")
    print("X_train and y_train shape: "+ str(X_train.shape) + str(y_train.shape))
    print("X_cal and y_cal shape: "+ str(X_cal.shape) + str(y_cal.shape))
    # The original call was cut off after its first argument; the remaining
    # placeholders are filled with the feature count and the number of
    # distinct target values.
    print('{} instances, {} features, {} classes'.format(
        y_train.size, X_train.shape[1], y_train.nunique()))
    return X_test, y_test, X_train, y_train, X_cal, y_cal


class NonConformist():
    """Inductive conformal regressor wrapped around a point-prediction model.

    Workflow: fit the underlying model on the proper training set
    (``underlying_fit``), predict the calibration set
    (``calibration_predictions``), turn those predictions into absolute-error
    nonconformity scores (``calibration_scores``), then build prediction
    intervals for new samples (``conformal_predictions``).

    Parameters
    ----------
    model : object
        Any estimator exposing ``fit(X, y)`` and ``predict(X)``.
    """

    def __init__(self, model):
        self.model = model

    @staticmethod
    def _as_1d(values):
        """Return ``values`` as a flat float array.

        Some regressors return predictions shaped ``(n, 1)``.  Subtracting a
        ``(n,)`` label vector from such an array silently broadcasts to an
        ``(n, n)`` matrix, so every array is flattened before arithmetic.
        """
        return np.asarray(values, dtype=float).ravel()

    @staticmethod
    def _border_score(sorted_scores, significance):
        """Pick the score at the ``significance`` quantile of a descending array."""
        if sorted_scores.size == 0:
            raise ValueError("calibration_scores must contain at least one score")
        border = int(np.floor(significance * (sorted_scores.size + 1))) - 1
        # Clamp so that very small / very large significance levels fall back
        # to the largest / smallest calibration score instead of wrapping.
        border = min(max(border, 0), sorted_scores.size - 1)
        return sorted_scores[border]

    def underlying_fit(self, X_train, y_train):
        """Train the underlying model on the proper training data.

        X_train: has shape (n_train, n_features)
        y_train: has shape (n_train)
        """
        self.model.fit(X_train, y_train)
        print("Model has been fitted")

    def calibration_predictions(self, X_cal):
        """Predict the calibration set with the underlying model.

        X_cal: array-like of shape (n_cal, n_features)
        Returns the predicted real values as a numpy array of shape (n_cal).
        """
        calibration_predictions = self._as_1d(self.model.predict(X_cal))

        print("Calibration Predictions Established")
        return calibration_predictions

    def test_predictions(self, X_test):
        """Predict the test set with the underlying model.

        X_test: array-like of shape (n_test, n_features)
        Returns the predicted real values as a numpy array of shape (n_test).
        """
        test_predictions = self._as_1d(self.model.predict(X_test))

        print("Test Predictions Established")
        return test_predictions

    @staticmethod
    def calibration_scores(calibration_predictions, y_cal):
        """Compute absolute-error nonconformity scores for the calibration set.

        For each calibration sample the score is
        ``| y^_i (predicted label) - y_i (true label) |``.

        calibration_predictions: array-like of predicted labels
        y_cal: array-like of true labels (same number of elements)
        Returns the scores as a numpy array sorted in descending order.
        """
        predictions = NonConformist._as_1d(calibration_predictions)
        true_labels = NonConformist._as_1d(y_cal)
        if predictions.size != true_labels.size:
            raise ValueError(
                "calibration_predictions and y_cal must have the same number "
                "of elements")
        scores = np.sort(np.abs(predictions - true_labels))[::-1]
        print("Calibration Scores Obtained")
        return scores

    def partial_inverse(self, calibration_scores, significance):
        """Partial inverse of the absolute-error nonconformity function.

        Returns a numpy array of shape (2, 1) where element ``[0]`` is
        subtracted from the prediction of the underlying model to create the
        lower boundary of the prediction interval and element ``[1]`` is
        added to it to create the upper boundary.  Both entries are equal
        because the absolute error is symmetric.

        calibration_scores: array-like of nonconformity scores
        significance: float between 0 and 1 (i.e. 0.05)
        """
        # Re-sorting is cheap and keeps the quantile lookup correct even when
        # the caller passes scores that are not already in descending order.
        ordered = np.sort(self._as_1d(calibration_scores))[::-1]
        width = self._border_score(ordered, significance)
        return np.vstack([width, width])

    def conformal_predictions(self, X_test, calibration_scores,
                              significance=None, test_predictions=None):
        """Create prediction intervals for a set of test examples.

        Each point prediction of the underlying model is widened by the
        (partial) inverse of the nonconformity function evaluated on the
        calibration scores, giving one interval per test pattern.

        X_test: array-like of shape [n_samples, n_features]
        calibration_scores: array-like of calibration nonconformity scores
        significance: float between 0 and 1 (maximum allowed error rate of
            the predictions) or ``None``
        test_predictions: array-like with n_samples point predictions; when
            ``None`` they are obtained from ``self.model.predict(X_test)``

        Returns a numpy array of shape [n_samples, 2] or [n_samples, 2, 99].
        If significance is ``None``, the result holds the interval (minimum
        and maximum boundaries) for each test pattern and each significance
        level (0.01, 0.02, ..., 0.99).  If significance is a float, the
        result holds the intervals at that single significance level.
        """
        if test_predictions is None:
            test_predictions = self.model.predict(X_test)
        point = self._as_1d(test_predictions)

        n_test = np.shape(X_test)[0]
        if point.size != n_test:
            raise ValueError(
                "test_predictions must contain one value per row of X_test")

        ordered = np.sort(self._as_1d(calibration_scores))[::-1]

        # `is not None` rather than truthiness, so a significance of 0 is
        # handled as a level and does not fall into the all-levels branch.
        if significance is not None:
            width = self._border_score(ordered, significance)
            return np.column_stack([point - width, point + width])

        # Integer range divided by 100 gives exactly 99 levels without the
        # length ambiguity of a floating-point arange step.
        levels = np.arange(1, 100) / 100.0
        widths = np.array([self._border_score(ordered, s) for s in levels])
        lower = point[:, np.newaxis] - widths
        upper = point[:, np.newaxis] + widths
        return np.stack([lower, upper], axis=1)

