AI Lead Scoring: Building Models That Identify Your Best Prospects
How to train ML models on your CRM data to automatically score and prioritize leads
AI Lead Scoring: Building Models That Identify Your Best Prospects
How to train ML models on your CRM data to automatically score and prioritize leads
Learn how to build custom AI lead scoring systems that analyze behavioral signals, firmographic data, and engagement patterns to identify high-value prospects — helping sales teams focus effort on leads most likely to convert.
AI Lead Scoring: Identifying Your Best Prospects Automatically
Sales teams waste enormous time on leads that will never buy. AI lead scoring changes this — prioritizing leads based on actual signals that predict conversion, not arbitrary rules.
The Problem with Traditional Lead Scoring
Most companies use point-based scoring:
This is better than nothing, but it has fundamental problems:
Building a Behavioral ML Lead Scoring Model
python
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, precision_recall_curve
import json


class LeadScoringModel:
    """
    Predicts lead-to-opportunity conversion probability.
    Train on historical leads with known outcomes.

    NOTE(review): no fitting routine is defined on this class. Before
    ``score_leads`` is called, ``self.scaler`` and ``self.model`` must have
    been fitted on the output of ``build_feature_matrix``, and
    ``self.feature_columns`` should hold the training column order.
    """

    # Ordinal encoding of the employee-count bucket (unknown buckets -> 3).
    _SIZE_MAP = {'1-10': 1, '11-50': 2, '51-200': 3, '201-500': 4,
                 '501-1000': 5, '1001-5000': 6, '5000+': 7}

    # Industry fit (score based on your historical conversion by industry).
    _INDUSTRY_FIT = {
        'Technology': 0.35, 'Financial Services': 0.28, 'Healthcare': 0.25,
        'Retail': 0.18, 'Manufacturing': 0.20, 'Education': 0.15
    }

    # Case-sensitive substrings that mark a job title as a decision maker.
    _DECISION_MAKER_TITLES = ('CEO', 'CTO', 'VP', 'Director', 'Head of', 'Chief')

    # Conversion-rate prior per lead source (unknown sources -> 0.15).
    _SOURCE_CONVERSION_RATES = {
        'Referral': 0.40, 'Organic Search': 0.25, 'Paid Search': 0.18,
        'Content/Inbound': 0.22, 'Outbound SDR': 0.12, 'Social': 0.08,
        'Event': 0.30, 'Partner': 0.35
    }

    def __init__(self):
        """Create an unfitted gradient-boosting classifier and scaler."""
        self.model = GradientBoostingClassifier(
            n_estimators=200, max_depth=4, learning_rate=0.05,
            subsample=0.8, random_state=42
        )
        self.scaler = StandardScaler()
        # Column order used when fitting; score_leads reindexes to it.
        self.feature_columns = None
        self.conversion_threshold = 0.5  # Optimized during training

    @staticmethod
    def _column(leads_df, name, default):
        """Return ``leads_df[name]``, or a constant Series when it is absent.

        ``DataFrame.get(name, default)`` hands back the bare scalar default
        for a missing column, which has no ``.map``/``.fillna``/``.apply``.
        Broadcasting the default over the frame's index keeps every
        downstream Series operation valid.
        """
        column = leads_df.get(name)
        if column is None:
            return pd.Series(default, index=leads_df.index)
        return column

    def build_feature_matrix(self, leads_df: pd.DataFrame) -> pd.DataFrame:
        """
        Build comprehensive feature set from lead data.

        Input: DataFrame with lead behavioral and firmographic data.
            Every column is optional; a missing column (or a missing value
            inside a column) falls back to the per-feature default below.
        Output: Feature matrix for ML model, one row per input row, indexed
            like ``leads_df``, with remaining NaNs replaced by 0.
        """
        def col(name, default):
            return self._column(leads_df, name, default)

        features = pd.DataFrame(index=leads_df.index)

        # ===== FIRMOGRAPHIC FIT =====
        # Company size
        features['company_size_score'] = (
            col('company_size', '51-200').map(self._SIZE_MAP).fillna(3)
        )
        # Industry fit
        features['industry_fit_score'] = (
            col('industry', 'Other').map(self._INDUSTRY_FIT).fillna(0.15)
        )
        # Job title/seniority
        titles = self._DECISION_MAKER_TITLES
        features['is_decision_maker'] = col('job_title', '').apply(
            lambda t: int(any(title in str(t) for title in titles))
        )
        # Technology stack fit (if you know what tech they use)
        features['tech_stack_score'] = col('tech_stack_fit_score', 0.5).fillna(0.5)

        # ===== BEHAVIORAL ENGAGEMENT =====
        # Email engagement
        features['email_open_rate'] = col('email_open_rate', 0).fillna(0)
        features['email_click_rate'] = col('email_click_rate', 0).fillna(0)
        features['email_count'] = col('emails_received', 0).fillna(0)
        # Website behavior
        features['page_views'] = col('total_page_views', 0).fillna(0)
        features['pricing_page_views'] = col('pricing_page_views', 0).fillna(0)
        features['case_study_views'] = col('case_study_views', 0).fillna(0)
        # NEGATIVE signal
        features['careers_page_views'] = col('careers_page_views', 0).fillna(0)
        features['demo_page_visits'] = col('demo_page_visits', 0).fillna(0)
        # Content engagement
        features['content_downloads'] = col('content_downloads', 0).fillna(0)
        features['webinar_attendance'] = col('webinars_attended', 0).fillna(0)
        # Intent signals
        features['free_trial_started'] = (
            col('free_trial_started', False).fillna(False).astype(int)
        )
        features['demo_requested'] = (
            col('demo_requested', False).fillna(False).astype(int)
        )
        features['pricing_requested'] = (
            col('pricing_requested', False).fillna(False).astype(int)
        )

        # ===== TIMING SIGNALS =====
        features['days_since_first_touch'] = col('days_since_first_touch', 30).fillna(30)
        features['days_since_last_activity'] = col('days_since_last_activity', 7).fillna(7)
        # Velocity (recent vs early engagement): recent activity divided by
        # the lead's age in 14-day periods (plus one).
        features['recent_activity_score'] = col('activity_last_14d', 0).fillna(0)
        features['engagement_velocity'] = (
            features['recent_activity_score']
            / (features['days_since_first_touch'] / 14 + 1)
        )

        # ===== SOURCE QUALITY =====
        features['source_quality'] = (
            col('lead_source', 'Other').map(self._SOURCE_CONVERSION_RATES).fillna(0.15)
        )

        # ===== COMPOSITE SCORES =====
        # High-intent composite
        features['high_intent_score'] = (
            features['demo_requested'] * 3 +
            features['pricing_page_views'] * 2 +
            features['free_trial_started'] * 4 +
            features['pricing_requested'] * 3 -
            features['careers_page_views']  # Negative signal
        )
        return features.fillna(0)

    def find_optimal_threshold(self, X_val, y_val) -> float:
        """Find the probability threshold that maximizes F1.

        Uses the fitted ``self.model`` to score ``X_val`` and evaluates F1 at
        every threshold returned by ``precision_recall_curve``. Returns 0.5
        when the curve yields no thresholds.
        """
        probs = self.model.predict_proba(X_val)[:, 1]
        precisions, recalls, thresholds = precision_recall_curve(y_val, probs)
        if len(thresholds) == 0:
            return 0.5
        # The final precision/recall pair has no matching threshold, so drop
        # it to keep the F1 array index-aligned with `thresholds`.
        precisions, recalls = precisions[:-1], recalls[:-1]
        # 1e-8 avoids 0/0 where precision and recall are both zero.
        f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
        return float(thresholds[np.argmax(f1_scores)])

    def score_leads(self, new_leads: pd.DataFrame) -> pd.DataFrame:
        """Score new leads and assign tier (A/B/C/D).

        ``new_leads`` must contain the identity columns ``lead_id``,
        ``name``, ``company``, ``email`` and ``job_title``. Returns those
        columns plus ``conversion_probability``, ``score_100``, ``tier`` and
        ``routing``, sorted by descending probability.
        """
        features = self.build_feature_matrix(new_leads)
        # Align to the training column order; unseen columns are filled with 0.
        features = features.reindex(columns=self.feature_columns, fill_value=0)
        X = self.scaler.transform(features)
        probabilities = self.model.predict_proba(X)[:, 1]

        result = new_leads[['lead_id', 'name', 'company', 'email', 'job_title']].copy()
        result['conversion_probability'] = probabilities.round(3)
        result['score_100'] = (probabilities * 100).round(0).astype(int)
        # include_lowest=True puts a probability of exactly 0.0 in tier D;
        # without it the first bin is (0, 0.2] and such leads get NaN.
        result['tier'] = pd.cut(
            probabilities,
            bins=[0, 0.2, 0.4, 0.65, 1.01],
            labels=['D', 'C', 'B', 'A'],
            include_lowest=True
        )
        result['routing'] = result['tier'].map({
            'A': 'Immediate AE outreach',
            'B': 'SDR sequence + AE if responds',
            'C': 'Nurture sequence (marketing)',
            'D': 'Long-term nurture or discard'
        })
        return result.sort_values('conversion_probability', ascending=False)

    def calculate_roi(self, scored_leads: pd.DataFrame,
                      avg_deal_value: float,
                      conversion_rates: dict) -> dict:
        """Calculate expected pipeline from scored lead batch.

        ``conversion_rates`` maps tier label to expected conversion rate;
        tiers missing from it use 0.1. ``roi_per_lead`` is 0 for an empty
        batch.
        """
        tier_counts = scored_leads['tier'].value_counts()
        expected_opps = 0
        expected_revenue = 0
        for tier, count in tier_counts.items():
            opps = count * conversion_rates.get(tier, 0.1)
            expected_opps += opps
            expected_revenue += opps * avg_deal_value

        total_leads = len(scored_leads)
        # Guard the per-lead average against an empty batch.
        roi_per_lead = round(expected_revenue / total_leads) if total_leads else 0
        return {
            'total_leads': total_leads,
            'tier_distribution': tier_counts.to_dict(),
            'expected_opportunities': round(expected_opps),
            'expected_pipeline': round(expected_revenue),
            'roi_per_lead': roi_per_lead
        }
# Segment leads for different outreach strategies
def segment_leads_for_outreach(scored_leads: pd.DataFrame) -> dict:
    """Group leads into outreach segments.

    ``scored_leads`` must have an integer-like ``score_100`` column. Rows
    are bucketed by score into four segments:

    - ``immediate_hot``: score >= 80
    - ``follow_up_warm``: 50 <= score < 80
    - ``nurture_cool``: 20 <= score < 50
    - ``disqualify``: score < 20

    Returns a dict mapping each segment name to a list of row dicts
    (``DataFrame.to_dict('records')``); an empty segment maps to ``[]``.
    """
    scores = scored_leads['score_100']
    segments = {
        'immediate_hot': scored_leads[scores >= 80],
        'follow_up_warm': scored_leads[(scores >= 50) & (scores < 80)],
        'nurture_cool': scored_leads[(scores >= 20) & (scores < 50)],
        'disqualify': scored_leads[scores < 20],
    }
    return {name: frame.to_dict('records') for name, frame in segments.items()}
Results Companies Are Seeing
HubSpot research (2023):
Real implementation data (mid-size SaaS, 1,000 leads/month):
The model improves over time as it sees more outcomes — building a sustainable competitive advantage.
相关教程
How AI-powered sales content platforms help reps find and personalize materials faster
How to use AI to research prospects and write genuinely personalized outreach that gets replies
How to use AI to transcribe, analyze, and learn from every sales conversation