Интеграция BigQuery ML для аналитики в Google Cloud
BigQuery ML позволяет обучать модели прямо в SQL без перемещения данных в отдельный ML-кластер. CREATE MODEL, TRANSFORM, ML.PREDICT — всё через стандартный SQL интерфейс. Интеграция с Vertex AI добавляет более сложные модели (AutoML, custom containers) при сохранении единого data layer.
Базовые модели через SQL
-- Логистическая регрессия для churn prediction
CREATE OR REPLACE MODEL `project.ml_models.churn_model`
OPTIONS(
model_type='LOGISTIC_REG',
input_label_cols=['churned'],
l2_reg=0.1,
max_iterations=50,
data_split_method='AUTO_SPLIT',
enable_global_explain=TRUE
) AS
SELECT
user_id,
days_since_last_session,
avg_session_duration_sec,
purchases_last_30d,
support_tickets_count,
subscription_months,
churned
FROM `project.features.user_churn_training`
WHERE split_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 90 DAY);
-- Оценка модели
SELECT *
FROM ML.EVALUATE(MODEL `project.ml_models.churn_model`,
(SELECT * FROM `project.features.user_churn_test`))
-- Предсказания
SELECT
u.user_id,
pred.predicted_churned,
pred.predicted_churned_probs[OFFSET(1)].prob as churn_probability
FROM ML.PREDICT(MODEL `project.ml_models.churn_model`,
(SELECT * FROM `project.features.users_current`)) pred
JOIN `project.raw.users` u USING(user_id)
WHERE pred.predicted_churned_probs[OFFSET(1)].prob > 0.7
ORDER BY churn_probability DESC;
-- Feature importance через SHAP
SELECT *
FROM ML.GLOBAL_EXPLAIN(MODEL `project.ml_models.churn_model`);
Gradient Boosted Trees и AutoML Tables
-- Gradient Boosted Trees (XGBoost под капотом)
CREATE OR REPLACE MODEL `project.ml_models.revenue_forecast`
OPTIONS(
model_type='BOOSTED_TREE_REGRESSOR',
num_parallel_tree=1,
max_tree_depth=6,
subsample=0.8,
colsample_bytree=0.8,
learn_rate=0.05,
max_iterations=200,
early_stop=TRUE,
min_rel_progress=0.001,
data_split_method='RANDOM',
data_split_eval_fraction=0.2,
input_label_cols=['revenue_next_30d']
) AS
SELECT * EXCEPT(user_id, split_date)
FROM `project.features.revenue_training`;
-- Time Series Forecasting с ARIMA_PLUS
CREATE OR REPLACE MODEL `project.ml_models.sales_forecast`
OPTIONS(
model_type='ARIMA_PLUS',
time_series_timestamp_col='date',
time_series_data_col='daily_revenue',
holiday_region='RU',
auto_arima=TRUE,
data_frequency='DAILY'
) AS
SELECT date, daily_revenue
FROM `project.analytics.daily_revenue`
WHERE date >= '2023-01-01'
ORDER BY date;
-- Прогноз на 30 дней вперёд
SELECT *
FROM ML.FORECAST(MODEL `project.ml_models.sales_forecast`,
STRUCT(30 AS horizon, 0.9 AS confidence_level));
Python в BigQuery через Colab Enterprise
# BigQuery DataFrame API (pandas-compatible)
import bigframes.pandas as bpd
from bigframes.ml.ensemble import RandomForestClassifier
from bigframes.ml.pipeline import Pipeline
from bigframes.ml.preprocessing import StandardScaler
bpd.options.bigquery.project = "your-project"
bpd.options.bigquery.location = "EU"
# Загрузка данных — работаем с BigQuery как с pandas
df = bpd.read_gbq("SELECT * FROM `project.features.training_data`")
# Train/test split
train_df, test_df = df.train_test_split(test_size=0.2, random_state=42)
X_train = train_df.drop(columns=["label"])
y_train = train_df["label"]
# BigFrames ML Pipeline
pipeline = Pipeline([
("scaler", StandardScaler()),
("model", RandomForestClassifier(n_estimators=100, random_state=42))
])
pipeline.fit(X_train, y_train)
# Оценка
from bigframes.ml.metrics import accuracy_score
predictions = pipeline.predict(test_df.drop(columns=["label"]))
accuracy = accuracy_score(test_df["label"], predictions)
print(f"Accuracy: {accuracy:.4f}")
# Сохранение в BigQuery ML Registry
pipeline.to_gbq("project.ml_models.rf_classifier")
Vertex AI Pipelines + BigQuery
from google.cloud import bigquery, aiplatform
from kfp import dsl
from kfp.v2.google.cloud import bigquery as kfp_bq
@dsl.pipeline(name="bq-ml-pipeline", pipeline_root="gs://ml-artifacts/pipelines")
def bq_ml_training_pipeline(
project: str,
dataset: str,
model_name: str
):
# Шаг 1: Подготовка данных
extract_op = kfp_bq.BigqueryQueryJobOp(
project=project,
location="EU",
query=f"""
CREATE OR REPLACE TABLE `{project}.{dataset}.training_features` AS
SELECT * FROM `{project}.features.user_features`
WHERE dt >= DATE_SUB(CURRENT_DATE(), INTERVAL 60 DAY)
"""
)
# Шаг 2: Обучение модели
train_op = kfp_bq.BigqueryCreateModelJobOp(
project=project,
location="EU",
query=f"""
CREATE OR REPLACE MODEL `{project}.{dataset}.{model_name}`
OPTIONS(model_type='BOOSTED_TREE_CLASSIFIER', input_label_cols=['label'])
AS SELECT * EXCEPT(user_id) FROM `{project}.{dataset}.training_features`
"""
).after(extract_op)
# Шаг 3: Оценка и регистрация
evaluate_op = kfp_bq.BigqueryEvaluateModelJobOp(
project=project,
location="EU",
model=train_op.outputs["model"]
).after(train_op)
# Запуск пайплайна
aiplatform.init(project="your-project", location="europe-west4")
job = aiplatform.PipelineJob(
display_name="bq-ml-pipeline",
template_path="pipeline.json",
parameter_values={"project": "your-project", "dataset": "ml", "model_name": "churn_v2"}
)
job.run()
Стоимость vs производительность
| Сценарий | Объём данных | BQ ML | Vertex AI Custom |
|---|---|---|---|
| Logistic Regression | 10M строк | ~$1-2 | ~$5-10 |
| Gradient Boosting | 100M строк | ~$15-30 | ~$20-40 |
| Time Series | 1M точек | ~$2-5 | ~$10-20 |
| AutoML Tables | 10M строк | N/A | ~$20-50 |
BigQuery ML оптимален для SQL-команд с данными уже в GCP. Порог переключения на Vertex AI Custom: необходимость в нестандартных архитектурах (трансформеры, кастомные loss функции) или требования latency < 10ms для онлайн-инференса.







