基于机器学习的迁徙距离预测模型,结合了基因数据和气候环境数据。以下是代码的逐部分解释:
1. 导入库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import geopandas as gpd
import cartopy.crs as ccrs
- 数据处理:使用
pandas
和numpy
进行数据加载和数值计算。 - 机器学习模型:导入了线性回归、随机森林和神经网络回归模型。
- 数据分割与评估:使用
train_test_split
分割数据,r2_score
评估模型性能。 - 可视化:使用
matplotlib
和geopandas
绘制地图,cartopy
用于地理坐标投影。
2. 类定义:MigrationPredictionModel
class MigrationPredictionModel:
def __init__(self, gene_data_path, climate_data_path):
self.gene_data = self.load_gene_data(gene_data_path)
self.climate_data = self.load_climate_data(climate_data_path)
- 初始化方法:加载基因型数据(
gene_data.csv
)和气候数据(climate_scenarios.csv
)。
3. 数据加载方法
def load_gene_data(self, path):
"""加载基因型数据"""
data = pd.read_csv(path)
return data
def load_climate_data(self, path):
"""加载气候环境数据"""
data = pd.read_csv(path)
return data
- 假设:基因数据包含
population, adcy8_freq, adcy8_exp, mig_distance
;气候数据包含location, temp_change, env_heterogeneity, food_availability
。 - 注意:实际使用时需确保文件路径正确,且数据格式与代码假设一致。
4. 数据预处理:prepare_training_data
def prepare_training_data(self):
"""合并基因和环境数据"""
merged = pd.merge(self.gene_data, self.climate_data,
left_on='population', right_on='location')
X = merged[['adcy8_freq', 'adcy8_exp', 'temp_change', 'env_heterogeneity']]
y = merged['mig_distance']
return train_test_split(X, y, test_size=0.2, random_state=42)
- 功能:将基因数据与气候数据按种群(
population
)和位置(location
)合并。 - 特征与标签:
X
包含基因和环境特征,y
是迁徙距离(mig_distance
)。 - 数据分割:按 80% 训练集、20% 测试集划分。
5. 模型训练:train_models
def train_models(self):
X_train, X_test, y_train, y_test = self.prepare_training_data()
self.lr_model = LinearRegression().fit(X_train, y_train)
lr_pred = self.lr_model.predict(X_test)
print(f"Linear Regression R²: {r2_score(y_test, lr_pred):.3f}")
self.rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_pred = self.rf_model.predict(X_test)
print(f"Random Forest R²: {r2_score(y_test, rf_pred):.3f}")
self.nn_model = MLPRegressor(hidden_layer_sizes=(50, 25), max_iter=1000, random_state=42).fit(X_train, y_train)
nn_pred = self.nn_model.predict(X_test)
print(f"Neural Network R²: {r2_score(y_test, nn_pred):.3f}")
self.best_model = self.rf_model if r2_score(y_test, rf_pred) > r2_score(y_test, nn_pred) else self.nn_model
- 模型选择:训练了线性回归、随机森林和神经网络模型。
- 评估:使用 R² 分数评估模型性能。
- 最佳模型:选择 R² 分数更高的随机森林或神经网络作为最终模型。
6. 迁徙距离预测:predict_migration
def predict_migration(self, adcy8_freq, adcy8_exp, scenario='SSP126'):
scenario_data = self.climate_data[self.climate_data['scenario'] == scenario]
scenario_data['adcy8_freq'] = adcy8_freq
scenario_data['adcy8_exp'] = adcy8_exp
X = scenario_data[['adcy8_freq', 'adcy8_exp', 'temp_change', 'env_heterogeneity']]
scenario_data['pred_distance'] = self.best_model.predict(X)
return scenario_data
- 功能:根据指定的气候情景(如
SSP126
或SSP585
)和基因特征预测迁徙距离。 - 注意:
adcy8_freq
和adcy8_exp
是用户输入的参数,需确保数据格式正确。
7. 风险评估:calculate_strategy_risk
def calculate_strategy_risk(self, threshold=5000):
scenario_data['risk'] = (scenario_data['pred_distance'] < threshold).astype(int) * 100
return scenario_data
- 功能:计算迁徙距离低于阈值(默认 5000)时的策略风险(百分比)。
8. 地图可视化:plot_prediction_map
def plot_prediction_map(self, scenario_data):
fig, ax = plt.subplots(figsize=(10, 6))
gdf = gpd.GeoDataFrame(scenario_data, geometry=gpd.points_from_xy(scenario_data['lon'], scenario_data['lat']))
gdf.plot(ax=ax, column='pred_distance', legend=True, cmap='viridis')
ax.set_title('Predicted Migration Distance')
plt.show()
- 功能:使用
geopandas
将预测数据绘制成地图,颜色表示迁徙距离。 - 问题:示例代码中假设存在
lon
和lat
列,实际需替换为真实经纬度数据。
9. 示例用法
model = MigrationPredictionModel('data/gene_data.csv', 'data/climate_data.csv')
model.train_models()
scenario_data = model.predict_migration(adcy8_freq=0.8, adcy8_exp=0.6, scenario='SSP126')
model.plot_prediction_map(scenario_data)
- 步骤:加载数据 → 训练模型 → 预测 → 可视化。
- 注意:需确保文件路径正确,且数据包含必要的列。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import geopandas as gpd
import cartopy.crs as ccrs
class MigrationPredictionModel:
def __init__(self, gene_data_path, climate_data_path):
self.gene_data = self.load_gene_data(gene_data_path)
self.climate_data = self.load_climate_data(climate_data_path)
def load_gene_data(self, path):
"""加载基因型数据"""
# 假设数据包含: population, adcy8_freq, adcy8_exp, mig_distance
data = pd.read_csv(path)
return data
def load_climate_data(self, path):
"""加载气候环境数据"""
# 包含未来气候情景:SSP126(控温1.5℃), SSP585(高排放)
# 列:location, temp_change, env_heterogeneity, food_availability
data = pd.read_csv(path)
return data
def prepare_training_data(self):
"""合并基因和环境数据"""
# 将基因数据与气候数据按种群和位置合并
merged = pd.merge(self.gene_data, self.climate_data,
left_on='population', right_on='location')
# 特征和标签
X = merged[['adcy8_freq', 'adcy8_exp', 'temp_change', 'env_heterogeneity']]
y = merged['mig_distance']
return train_test_split(X, y, test_size=0.2, random_state=42)
def train_models(self):
"""训练多种预测模型"""
X_train, X_test, y_train, y_test = self.prepare_training_data()
# 线性回归模型 (基线)
self.lr_model = LinearRegression()
self.lr_model.fit(X_train, y_train)
lr_pred = self.lr_model.predict(X_test)
print(f"Linear Regression R²: {r2_score(y_test, lr_pred):.3f}")
# 随机森林模型
self.rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
self.rf_model.fit(X_train, y_train)
rf_pred = self.rf_model.predict(X_test)
print(f"Random Forest R²: {r2_score(y_test, rf_pred):.3f}")
# 神经网络模型
self.nn_model = MLPRegressor(hidden_layer_sizes=(50, 25),
max_iter=1000, random_state=42)
self.nn_model.fit(X_train, y_train)
nn_pred = self.nn_model.predict(X_test)
print(f"Neural Network R²: {r2_score(y_test, nn_pred):.3f}")
# 选择最佳模型 (实际应用中需要更严谨的评估)
self.best_model = self.rf_model if r2_score(y_test, rf_pred) > r2_score(y_test, nn_pred) else self.nn_model
def predict_migration(self, adcy8_freq, adcy8_exp, scenario='SSP126'):
"""预测特定场景下的迁徙距离"""
# 获取指定气候场景的数据
scenario_data = self.climate_data[self.climate_data['scenario'] == scenario]
# 为所有位置创建基因特征
scenario_data['adcy8_freq'] = adcy8_freq
scenario_data['adcy8_exp'] = adcy8_exp
X = scenario_data[['adcy8_freq', 'adcy8_exp', 'temp_change', 'env_heterogeneity']]
# 预测并添加到数据中
scenario_data['pred_distance'] = self.best_model.predict(X)
return scenario_data
def calculate_strategy_risk(self, predictions, threshold=5000):
"""计算策略失效风险 (长距离迁徙不再有利)"""
# 当预测距离显著小于历史基准时风险增加
predictions['strategy_risk'] = np.where(
predictions['pred_distance'] < threshold,
(1 - predictions['pred_distance']/threshold) * 100,
0
)
return predictions
def plot_prediction_map(self, prediction_data):
"""绘制预测结果地图"""
# 加载世界地图数据
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# 创建基础地图
fig, ax = plt.subplots(figsize=(15, 10), subplot_kw={'projection': ccrs.PlateCarree()})
world.plot(ax=ax, color='lightgray', edgecolor='white')
# 绘制预测点 (实际应用中需要坐标数据)
for _, row in prediction_data.iterrows():
# 随机生成位置作为示例
lon = np.random.uniform(-180, 180)
lat = np.random.uniform(-90, 90)
# 按距离大小着色
color_intensity = min(1.0, row['pred_distance'] / 10000)
color = plt.cm.Reds(color_intensity)
size = max(20, row['pred_distance'] / 100) # 按距离缩放点大小
ax.scatter(lon, lat, s=size, color=color, edgecolor='black', alpha=0.7)
# 添加风险标注
if row['strategy_risk'] > 30:
ax.text(lon, lat, f"{row['strategy_risk']:.0f}%",
fontsize=8, ha='center', va='center',
bbox=dict(facecolor='white', alpha=0.6))
plt.title("Migration Distance Prediction based on ADCY8 Gene", fontsize=16)
plt.show()
# 示例用法
if __name__ == "__main__":
# 初始化模型 (需要真实数据文件路径)
model = MigrationPredictionModel("gene_data.csv", "climate_scenarios.csv")
# 训练模型
model.train_models()
# 预测特定种群在两种气候场景下的迁徙
predictions_ssp126 = model.predict_migration(
adcy8_freq=0.75, # ADCY8基因频率
adcy8_exp=1.2, # ADCY8表达水平
scenario='SSP126' # 控温1.5℃情景
)
predictions_ssp585 = model.predict_migration(
adcy8_freq=0.75,
adcy8_exp=1.2,
scenario='SSP585' # 高排放情景
)
# 计算策略风险
predictions_ssp126 = model.calculate_strategy_risk(predictions_ssp126)
predictions_ssp585 = model.calculate_strategy_risk(predictions_ssp585)
# 输出高风险区域
high_risk_126 = predictions_ssp126[predictions_ssp126['strategy_risk'] > 50]
high_risk_585 = predictions_ssp585[predictions_ssp585['strategy_risk'] > 50]
print(f"SSP126高风险区域数量: {len(high_risk_126)}")
print(f"SSP585高风险区域数量: {len(high_risk_585)}")
# 可视化预测结果
model.plot_prediction_map(predictions_ssp126)
model.plot_prediction_map(predictions_ssp585)