Development of an AI system for predictive network maintenance
A telecom network consists of thousands of active components: base stations, switches, routers, and optical amplifiers. Scheduled preventive replacement of components costs more than predictive replacement: up to 60% of scheduled replacements happen before the component actually needs replacing. The ML system predicts failures from KPI trends and SNMP telemetry.
Network element telemetry
Data sources for predictive analysis:
# Catalogue of telemetry feeds used for predictive analysis.
# Each entry is descriptive metadata only (protocol, cadence, example signals).
data_sources = {
    # SNMP notifications combined with periodic polling.
    "snmp_traps": {
        "protocol": "SNMP v2c/v3",
        "frequency": "event-driven + 5-min polling",
        "examples": ["linkDown", "authenticationFailure", "cpuThreshold"],
    },
    # Flow-level traffic statistics.
    "netflow_ipfix": {
        "measures": "flow statistics, traffic matrix",
        "frequency": "1-min aggregates",
    },
    # Device log messages.
    "syslog": {
        "content": "structured error/warning messages",
        "volume": "10k-100k events/hour на medium network",
    },
    # Performance counters, grouped by network element type.
    "performance_counters": {
        "for_base_stations": [
            "RSSI",
            "SINR",
            "handover_success_rate",
            "RRC_setup_failure",
        ],
        "for_routers": [
            "cpu_util",
            "memory_util",
            "interface_error_rate",
            "bgp_route_flaps",
        ],
        "for_optical": ["optical_power_dbm", "chromatic_dispersion", "OSNR"],
    },
}
Network element degradation models
Predictive model for base stations:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
def build_bs_failure_predictor(training_data: pd.DataFrame) -> LGBMClassifier:
    """Fit a LightGBM classifier that predicts base-station failure within 7 days.

    The model is trained on 7-day KPI trend columns, hardware statistics,
    alarm/restart counters and static context columns, all read directly from
    ``training_data``.

    Args:
        training_data: Frame containing every feature column listed in
            ``feature_groups`` below plus the label column ``failure_in_7d``.

    Returns:
        The fitted ``LGBMClassifier``.
    """
    feature_groups = {
        # KPI trends
        'kpi_trends': [
            'rssi_trend_7d',
            'sinr_trend_7d',
            'handover_sr_trend_7d',
            'rrc_failures_trend_7d',
            'vswr_trend_7d',  # antenna mismatch
        ],
        # Hardware statistics
        'hw_metrics': [
            'cpu_util_avg_30d',
            'cpu_util_max_7d',
            'memory_util_avg_30d',
            'temperature_max_30d',
            'fan_speed_deviation',  # abnormal fan speed
            'power_consumption_trend',
        ],
        # Event history
        'event_history': [
            'alarm_count_7d',
            'critical_alarm_count_30d',
            'restart_count_90d',
            'hw_error_count_7d',
        ],
        # Context
        'context': [
            'age_years',
            'vendor_encoded',
            'climate_zone',
            'traffic_load_avg_30d',  # heavily loaded stations age faster
        ],
    }

    # Flatten the groups into one column list, preserving declaration order.
    all_features = []
    for group_columns in feature_groups.values():
        all_features.extend(group_columns)

    classifier = LGBMClassifier(
        n_estimators=300,
        learning_rate=0.05,
        # ~6% of base stations have failures in a 30-day window.
        # NOTE(review): the label below is the 7-day window; confirm this
        # weight matches the class balance of `failure_in_7d`.
        scale_pos_weight=15,
        metric='average_precision',
    )

    features = training_data[all_features]
    labels = training_data['failure_in_7d']
    classifier.fit(features, labels)
    return classifier
Feature Engineering for Trends:
def compute_kpi_trends(kpi_series: pd.Series, windows=(7, 14, 30)) -> dict:
    """Summarise the recent behaviour of a KPI over several trailing windows.

    For each window size ``w`` the last ``w`` samples of ``kpi_series`` are
    taken. When at least three of them are non-missing, four entries are added
    to the result:

    * ``slope_{w}d`` - least-squares linear slope, in KPI units per sample
    * ``std_{w}d``   - standard deviation (volatility)
    * ``mean_{w}d``  - mean
    * ``min_{w}d``   - minimum

    Windows with fewer than three usable samples contribute no keys, so
    callers must not assume a given key is present.

    Args:
        kpi_series: Numeric KPI samples ordered oldest to newest. The window
            sizes count samples, so the ``d`` suffix is only a day count if
            the series holds one sample per day.
        windows: Trailing window sizes, in samples.

    Returns:
        Dict mapping the key names above to their values.
    """
    trends = {}
    for w in windows:
        recent = kpi_series.tail(w)
        values = recent.to_numpy(dtype=float, na_value=np.nan)

        # Telemetry gaps (NaN) would make the least-squares fit fail or
        # return NaN, so fit only the samples that are actually present.
        valid = ~np.isnan(values)
        if valid.sum() < 3:
            continue

        # Keep the original sample positions so a gap does not compress the
        # x-axis and distort the slope.
        positions = np.arange(len(values))[valid]
        slope = np.polyfit(positions, values[valid], 1)[0]

        trends[f'slope_{w}d'] = slope
        # pandas reductions skip missing values by default.
        trends[f'std_{w}d'] = recent.std()
        trends[f'mean_{w}d'] = recent.mean()
        trends[f'min_{w}d'] = recent.min()
    return trends
Optical transport (DWDM)
Optical Path Degradation Monitoring:
def analyze_optical_degradation(optical_samples: pd.DataFrame,
                                channel_id: str,
                                *,
                                osnr_threshold: float = 15.0,
                                maintenance_horizon_days: float = 14,
                                max_power_deviation_db: float = 3) -> dict:
    """Assess degradation of one DWDM channel and decide whether to service it.

    On DWDM channels, rising dispersion or falling OSNR precedes bit errors.
    The OSNR slope over the trailing 30 samples is extrapolated linearly to
    estimate when OSNR will drop below ``osnr_threshold``.

    Args:
        optical_samples: Frame with columns ``channel_id``, ``osnr_db`` and
            ``rx_power_dbm``. Rows are ordered by the frame's index
            (presumably a timestamp - confirm against the caller).
        channel_id: Channel to analyse.
        osnr_threshold: Minimum acceptable OSNR in dB (default 15 dB, the
            value used for 100G).
        maintenance_horizon_days: Recommend maintenance when the threshold is
            projected to be crossed sooner than this.
        max_power_deviation_db: Recommend maintenance when the latest receive
            power differs from the channel's mean by more than this.

    Returns:
        Dict with ``channel_id``, ``current_osnr``, ``osnr_trend_db_per_day``,
        ``days_to_osnr_threshold`` (0.0 if already below the threshold,
        ``inf`` if OSNR is not falling), ``power_deviation_db`` and the
        boolean ``maintenance_recommended``.

    Raises:
        KeyError: If a required column is missing, or if the channel has too
            few usable OSNR samples for a 30-sample slope (this includes an
            unknown ``channel_id``).
    """
    channel_data = optical_samples[optical_samples['channel_id'] == channel_id].sort_index()

    # OSNR trend. The slope is per sample; the "per day" naming assumes one
    # sample per day - TODO confirm the sampling cadence.
    osnr = channel_data['osnr_db']
    osnr_trend = compute_kpi_trends(osnr)['slope_30d']
    current_osnr = osnr.iloc[-1]

    # Forecast: days until OSNR falls below the threshold.
    if current_osnr < osnr_threshold:
        # Already out of spec: must not be reported as "never" just because
        # the trend happens to be flat or rising.
        days_to_threshold = 0.0
    elif osnr_trend < 0:
        days_to_threshold = (current_osnr - osnr_threshold) / abs(osnr_trend)
    else:
        days_to_threshold = float('inf')

    # Optical power: distance of the latest reading from the channel's mean.
    rx_power = channel_data['rx_power_dbm']
    power_deviation = abs(rx_power.iloc[-1] - rx_power.mean())

    # Cast to a plain bool so the result serialises cleanly (numpy booleans
    # are not JSON-serialisable).
    maintenance_recommended = bool(
        days_to_threshold < maintenance_horizon_days
        or power_deviation > max_power_deviation_db
    )

    return {
        'channel_id': channel_id,
        'current_osnr': current_osnr,
        'osnr_trend_db_per_day': osnr_trend,
        'days_to_osnr_threshold': round(days_to_threshold, 1),
        'power_deviation_db': round(power_deviation, 2),
        'maintenance_recommended': maintenance_recommended,
    }
Classification of failure types
Multiclass model + interpretation:
from sklearn.ensemble import RandomForestClassifier
import shap
# Failure categories handled by the classifier. classify_failure_type looks
# entries up by position, so the order of this list matters.
failure_types = [
    "hardware_failure",     # physical failure of a component
    "software_crash",       # software bug / memory exhaustion
    "overload",             # traffic overload
    "configuration_error",  # human error
    "power_issue",          # power supply problem
    "optical_degradation",  # degradation of the optical path
]
def classify_failure_type(fault_features: pd.DataFrame, model=None) -> dict:
    """Classify a single fault and recommend a dispatch action.

    Helps the dispatch team: e.g. ``software_crash`` leads to a remote
    reboot, ``hardware_failure`` to a field engineer visit.

    Args:
        fault_features: Features of ONE fault, either a single-row DataFrame
            or a 1-D Series/array of feature values.
        model: A fitted classifier exposing ``predict_proba`` and
            ``classes_``. Its class labels must be either failure-type names
            or integer indices into ``failure_types``.

    Returns:
        Dict with ``failure_type`` (most probable class), ``confidence`` (its
        probability), ``dispatch`` (recommended action) and ``probabilities``
        (probability per entry of ``failure_types``; classes the model does
        not know are reported as 0.0).

    Raises:
        sklearn.exceptions.NotFittedError: If no fitted ``model`` is supplied.
        ValueError: If ``fault_features`` does not describe exactly one fault.
    """
    if model is None:
        # An unfitted estimator cannot predict; fail fast with the exception
        # type scikit-learn itself raises for unfitted estimators.
        from sklearn.exceptions import NotFittedError
        raise NotFittedError(
            "classify_failure_type requires a fitted classifier; "
            "pass it via the `model` argument."
        )

    # predict_proba expects a 2-D (n_samples, n_features) input; normalise
    # both accepted input forms to a single row.
    sample = np.asarray(fault_features)
    if sample.ndim == 1:
        sample = sample.reshape(1, -1)
    if sample.ndim != 2 or sample.shape[0] != 1:
        raise ValueError(
            "fault_features must describe exactly one fault, got shape "
            f"{sample.shape}"
        )

    probabilities = np.asarray(model.predict_proba(sample))[0]

    # Probability columns follow model.classes_, which need not match the
    # order of failure_types (scikit-learn sorts class labels), so map each
    # column through classes_ rather than by position in failure_types.
    class_names = [
        label if isinstance(label, str) else failure_types[int(label)]
        for label in model.classes_
    ]
    predicted_class = class_names[int(np.argmax(probabilities))]

    per_class = {name: 0.0 for name in failure_types}
    per_class.update(zip(class_names, probabilities.tolist()))

    dispatch_recommendation = {
        'hardware_failure': 'field_engineer_required',
        'software_crash': 'remote_reboot_and_monitoring',
        'overload': 'traffic_rerouting_capacity_upgrade',
        'configuration_error': 'rollback_config_change',
        'power_issue': 'check_ups_and_power_supply',
        'optical_degradation': 'schedule_fiber_inspection',
    }

    return {
        'failure_type': predicted_class,
        'confidence': float(probabilities.max()),
        'dispatch': dispatch_recommendation[predicted_class],
        'probabilities': per_class,
    }
Integration with NOC
Ticket prioritization: Risk score = P(failure) × business_impact × customer_count. The top 20 risk items are prioritized in the NOC queue. Integration with ServiceNow, Remedy, and Jira Service Management via REST API.
Timeline: SNMP telemetry, basic anomaly detection, and NOC alerts — 3-4 weeks. LightGBM failure predictor, optical monitoring, failure-type classification, and CMDB integration — 2-3 months.







