Suppose we observe data and would like to fit a regression model for $\mathbf{E}[Y \,|\, X]$. Unfortunately, $Y$ is sometimes measured with errors whose mean is nonzero.

Let $Z \in \{\text{unbiased}, \text{biased}\}$ indicate whether $Y$ is measured with classical zero-mean errors or with nonzero-mean errors, respectively. We would like to estimate $\mathbf{E}[Y \,|\, X, Z = \text{unbiased}]$. Unfortunately, $Z$ is generally not observed, and $\mathbf{E}[Y \,|\, X] \neq \mathbf{E}[Y \,|\, X, Z = \text{unbiased}]$. If we fit a regression of $Y$ on $X$, we will get biased predictions.
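To see where the bias comes from, note that by the law of total expectation the unweighted regression targets a mixture of the two conditional means:

$$\mathbf{E}[Y \,|\, X] = \Pr[Z = \text{unbiased} \,|\, X] \, \mathbf{E}[Y \,|\, X, Z = \text{unbiased}] + \Pr[Z = \text{biased} \,|\, X] \, \mathbf{E}[Y \,|\, X, Z = \text{biased}],$$

which differs from $\mathbf{E}[Y \,|\, X, Z = \text{unbiased}]$ whenever the mismeasurement has nonzero mean and occurs with positive probability.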
Suppose that $Z$ is generally not observable, but that we have access to a model of $\Pr[Z \,|\, X, Y]$ (say, because we hand-labeled $Z$ on a small training set and fit a classification model with $Z$ as the target variable). Does fitting a regression of $Y$ on $X$ using $\Pr[Z = \text{unbiased} \,|\, X, Y]$ as regression weights produce an unbiased estimate of $\mathbf{E}[Y \,|\, X, Z = \text{unbiased}]$ (or, failing that, an estimate that is less biased than what we would get without the weights)? Is this method used in practice, and does it have a name?
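To make the proposal concrete: for a linear model, the weighting scheme amounts to weighted least squares with the estimated probabilities as weights,

$$\hat{\beta} = \arg\min_{\beta} \sum_{i} \widehat{\Pr}[Z_i = \text{unbiased} \,|\, x_i, y_i] \, (y_i - x_i^{\top} \beta)^2,$$

where $x_i^{\top} \beta$ is shorthand that includes the intercept.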
Clarification: the goal is to fit a model that minimizes mean squared error on unseen (test) data for which $Z = \text{unbiased}$. The optimal predictor for that objective is $\mathbf{E}[Y \,|\, X, Z = \text{unbiased}]$, so that is the function we are trying to estimate. Methods for solving this problem should be ranked by how well they achieve that objective.
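This is just the standard fact that the conditional mean minimizes mean squared error, restricted here to the unbiased subpopulation:

$$f^{*} = \arg\min_{f} \mathbf{E}\!\left[(Y - f(X))^2 \,\middle|\, Z = \text{unbiased}\right], \qquad f^{*}(X) = \mathbf{E}[Y \,|\, X, Z = \text{unbiased}].$$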
Toy example in R, with df$y_is_unbiased playing the role of $Z$ and df$y_observed playing the role of $Y$:
library(ggplot2)
library(randomForest)
set.seed(12345)
get_df <- function(n_obs, constant, beta, sd_epsilon, mismeasurement) {
df <- data.frame(x1=rnorm(n_obs), x2=rnorm(n_obs), epsilon=rnorm(n_obs, sd=sd_epsilon))
## Value of Y if measured correctly
df$y_unbiased <- constant + as.matrix(df[c("x1", "x2")]) %*% beta + df$epsilon
## Value of Y if measured incorrectly
df$y_biased <- df$y_unbiased + sample(mismeasurement, size=n_obs, replace=TRUE)
## Y is equally likely to be measured correctly or incorrectly
df$y_is_unbiased <- sample(c(TRUE, FALSE), size=n_obs, replace=TRUE)
df$y_observed <- ifelse(df$y_is_unbiased, df$y_unbiased, df$y_biased)
return(df)
}
## True coefficients
constant <- 5
beta <- c(1, 5)
df <- get_df(n_obs=2000, constant=constant, beta=beta, sd_epsilon=1.0, mismeasurement=c(-10.0, 5.0))
ggplot(df, aes(x=x1, y=y_observed, color=y_is_unbiased)) + geom_point() + scale_color_manual(values=c("#ff7f00", "#377eb8"))
## For facet_wrap title
df$string_y_is_unbiased <- paste0("y_is_unbiased: ", df$y_is_unbiased)
## Notice that Pr[Y | Z = biased] differs from Pr[Y | Z = unbiased]
ggplot(df, aes(x=y_observed)) + geom_histogram(color="black", fill="grey", binwidth=0.5) + facet_wrap(~ string_y_is_unbiased, ncol=1)
## Recover true constant and beta (plus noise) when using y_unbiased
summary(lm(y_unbiased ~ x1 + x2, data=df))
## Biased estimates when using y_biased (constant is biased downward)
summary(lm(y_biased ~ x1 + x2, data=df))
## Also get biased estimates when using y_observed (constant is biased downward)
summary(lm(y_observed ~ x1 + x2, data=df))
## Now imagine that we "rate" a subset of the data (manually check/research whether y was measured with or without bias)
n_rated <- 1000
df_rated <- df[1:n_rated, ]
## Use a factor so that randomForest does classification instead of regression
df_rated$y_is_unbiased <- factor(df_rated$y_is_unbiased)
model_pr_unbiased <- randomForest(formula=y_is_unbiased ~ y_observed + x1 + x2, data=df_rated, mtry=2)
## Examine OOB confusion matrix (error rate < 5%)
print(model_pr_unbiased)
## Use the model to get Pr[Y is unbiased | X, observed Y] on unrated data
df_unrated <- df[(n_rated+1):nrow(df), ]
df_unrated$pr_unbiased <- as.vector(predict(model_pr_unbiased, newdata=df_unrated, type="prob")[, "TRUE"])
## Train a model on unrated data, using pr_unbiased as regression weights -- is this unbiased?
summary(lm(y_observed ~ x1 + x2, data=df_unrated, weights=df_unrated$pr_unbiased))
In this example, the model for $\Pr[Z = \text{unbiased} \,|\, X, Y]$ is a random forest with formula=y_is_unbiased ~ y_observed + x1 + x2. If this model were perfectly accurate, it would produce weights of 1.0 wherever $Y$ is unbiased and 0.0 wherever $Y$ is biased, and the weighted regression would clearly be unbiased. What happens when the model for $\Pr[Z = \text{unbiased} \,|\, X, Y]$ has good, but imperfect, test precision and recall (accuracy below 100%)? Is the weighted regression guaranteed to be less biased than the unweighted regression of $Y$ on $X$?
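As a sanity check (possible only in simulation, since the true indicator df$y_is_unbiased is known), we can weight by the exact 0/1 indicator; lm gives zero-weight rows no influence on the fit, so this should recover the true constant and beta up to noise:

## Oracle check: weight by the true indicator (not available in practice)
summary(lm(y_observed ~ x1 + x2, data=df, weights=as.numeric(df$y_is_unbiased)))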
Here is a slightly more complicated example in which $\Pr[Z = \text{unbiased} \,|\, X]$ varies with $X$ (in contrast to the simpler example posted above, where it was constant):
library(ggplot2)
library(randomForest)
set.seed(12345)
logistic <- function(x) {
return(1 / (1 + exp(-x)))
}
pr_y_is_unbiased <- function(x1, x2) {
## This function returns Pr[ Z = unbiased | X ]
return(logistic(x1 + 2*x2))
}
get_df <- function(n_obs, constant, beta, sd_epsilon, mismeasurement) {
df <- data.frame(x1=rnorm(n_obs), x2=rnorm(n_obs), epsilon=rnorm(n_obs, sd=sd_epsilon))
## Value of Y if measured correctly
df$y_unbiased <- constant + as.matrix(df[c("x1", "x2")]) %*% beta + df$epsilon
## Value of Y if measured incorrectly
df$y_biased <- df$y_unbiased + sample(mismeasurement, size=n_obs, replace=TRUE)
## Note: in this example, Pr[ Z = biased | X ] varies with X
## In the first (simpler) example I posted, Pr[ Z = biased | X ] = 1/2 was constant with respect to X
df$y_is_unbiased <- runif(n_obs) < pr_y_is_unbiased(df$x1, df$x2)
df$y_observed <- ifelse(df$y_is_unbiased, df$y_unbiased, df$y_biased)
return(df)
}
## True coefficients
constant <- 5
beta <- c(1, 5)
df <- get_df(n_obs=2000, constant=constant, beta=beta, sd_epsilon=1.0, mismeasurement=c(-10.0, 5.0))
ggplot(df, aes(x=x1, y=y_observed, color=y_is_unbiased)) + geom_point() + scale_color_manual(values=c("#ff7f00", "#377eb8"))
## For facet_wrap title
df$string_y_is_unbiased <- paste0("y_is_unbiased: ", df$y_is_unbiased)
## Notice that Pr[Y | Z = biased] differs from Pr[Y | Z = unbiased]
ggplot(df, aes(x=y_observed)) + geom_histogram(color="black", fill="grey", binwidth=0.5) + facet_wrap(~ string_y_is_unbiased, ncol=1)
## Recover true constant and beta (plus noise) when using y_unbiased
summary(lm(y_unbiased ~ x1 + x2, data=df))
## Biased estimates when using y_biased (constant is biased downward)
summary(lm(y_biased ~ x1 + x2, data=df))
## Also get biased estimates when using y_observed
## Note: the constant is biased downward _and_ the coefficient on x2 is biased upward!
summary(lm(y_observed ~ x1 + x2, data=df))
## Now imagine that we "rate" a subset of the data (manually check/research whether y was measured with or without bias)
n_rated <- 1000
df_rated <- df[1:n_rated, ]
## Use a factor so that randomForest does classification instead of regression
df_rated$y_is_unbiased <- factor(df_rated$y_is_unbiased)
model_pr_unbiased <- randomForest(formula=y_is_unbiased ~ y_observed + x1 + x2, data=df_rated, mtry=2)
## Examine OOB confusion matrix (error rate < 5%)
print(model_pr_unbiased)
## Use the model to get Pr[Y is unbiased | X, observed Y] on unrated data
df_unrated <- df[(n_rated+1):nrow(df), ]
df_unrated$pr_unbiased <- as.vector(predict(model_pr_unbiased, newdata=df_unrated, type="prob")[, "TRUE"])
## Train a model on unrated data, using pr_unbiased as regression weights -- is this unbiased? If not, is it _less_ biased than the unweighted model?
summary(lm(y_observed ~ x1 + x2, data=df_unrated, weights=df_unrated$pr_unbiased))
## What happens if we use pr_unbiased as a feature (aka predictor) in the regression, rather than a weight?
## In this case the weighted regression seems to do better, but neither is perfect
## Note: copied from shabbychef's answer
summary(lm(formula = y_observed ~ x1 + x2 + I(1 - pr_unbiased), data = df_unrated))
In this example, the weighted regression of $Y$ on $X$ looks much less biased than the unweighted regression. Is that true in general? I also tried shabbychef's suggestion (see the answer below) on this example, and it appears to perform somewhat worse than the weighted regression.
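Rather than eyeballing a single simulated draw, one can repeat the simulation and average the estimates to gauge the bias directly. A minimal sketch for the unweighted baseline (the number of replications, 200, is an arbitrary choice; the weighted pipeline could be wrapped the same way):

## Monte Carlo check: average the unweighted estimates over repeated simulations
n_sims <- 200
estimates <- replicate(n_sims, {
    df_sim <- get_df(n_obs=2000, constant=constant, beta=beta, sd_epsilon=1.0, mismeasurement=c(-10.0, 5.0))
    coef(lm(y_observed ~ x1 + x2, data=df_sim))
})
rowMeans(estimates)  ## Compare with c(constant, beta) = c(5, 1, 5)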
For those who prefer Python to R, here is the second simulation in Python:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
def logistic(x):
return 1 / (1 + np.exp(-x))
def pr_y_is_unbiased(x1, x2):
# This function returns Pr[ Z = unbiased | X ]
return logistic(x1 + 2*x2)
def get_df(n_obs, constant, beta, sd_epsilon, mismeasurement):
df = pd.DataFrame({
'x1': np.random.normal(size=n_obs),
'x2': np.random.normal(size=n_obs),
'epsilon': np.random.normal(size=n_obs, scale=sd_epsilon),
})
df['y_unbiased'] = constant + np.dot(np.array(df[['x1', 'x2']]), beta) + df['epsilon']
# Note: df['y_biased'].mean() will differ from df['y_unbiased'].mean() if the mismeasurements have a nonzero mean
df['y_biased'] = df['y_unbiased'] + np.random.choice(mismeasurement, size=n_obs)
df['y_is_unbiased'] = np.random.uniform(size=n_obs) < pr_y_is_unbiased(df['x1'], df['x2'])
df['y_observed'] = df.apply(lambda row: row['y_unbiased'] if row['y_is_unbiased'] else row['y_biased'], axis=1)
return df
constant = 5
beta = np.array([1, 5])
print(f'true coefficients:\n constant = {constant}, beta = {beta}')
n_obs = 2000
# Note: the mean of the possible mismeasurements is nonzero (this is the source of the bias)
df = get_df(n_obs=n_obs, constant=constant, beta=beta, sd_epsilon=1.0, mismeasurement=[-10.0, 5.0])
lr = LinearRegression()
lr.fit(X=df[['x1', 'x2']], y=df['y_observed'])
print(f'estimates from unweighted regression of Y on X ({df.shape[0]} obs):\n constant = {lr.intercept_}, beta = {lr.coef_}')
# Note: pretend that we only observe y_is_unbiased on a "rated" subset of the data
n_rated = n_obs // 2
df_rated = df.iloc[:n_rated].copy()
df_unrated = df.iloc[n_rated:].copy()
rf = RandomForestClassifier(n_estimators=500, max_features=2, oob_score=True)
rf_predictors = ['y_observed', 'x1', 'x2']
rf.fit(X=df_rated[rf_predictors], y=df_rated['y_is_unbiased'])
print(f'random forest classifier OOB accuracy (for predicting whether Y is unbiased): {rf.oob_score_}')
df_unrated['pr_y_is_unbiased'] = rf.predict_proba(df_unrated[rf_predictors])[:, 1]
lr.fit(X=df_unrated[['x1', 'x2']], y=df_unrated['y_observed'], sample_weight=df_unrated['pr_y_is_unbiased'])
print(f'estimates from weighted regression of Y on X ({df_unrated.shape[0]} obs):\n constant = {lr.intercept_}, beta = {lr.coef_}')