AI Lead Scoring: Building Models That Identify Your Best Prospects
How to train ML models on your CRM data to automatically score and prioritize leads
AI Lead Scoring: Identifying Your Best Prospects Automatically
Sales teams waste an enormous amount of time on leads that will never buy. AI lead scoring changes this by prioritizing leads based on actual signals that predict conversion, rather than on arbitrary rules.
The Problem with Traditional Lead Scoring
Most companies use point-based scoring:
This is better than nothing, but it has fundamental problems:
Building a Behavioral ML Lead Scoring Model
python
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, precision_recall_curve
import json


class LeadScoringModel:
    """
    Predicts lead-to-opportunity conversion probability.
    Train on historical leads with known outcomes.

    The class wires a gradient-boosting classifier and a standard scaler
    around a hand-built feature matrix. It does not define a training
    routine itself: before calling ``score_leads`` the caller must fit
    ``self.scaler`` and ``self.model`` and set ``self.feature_columns`` to
    the column order used for fitting.
    """

    def __init__(self):
        self.model = GradientBoostingClassifier(
            n_estimators=200, max_depth=4, learning_rate=0.05,
            subsample=0.8, random_state=42
        )
        self.scaler = StandardScaler()
        # Column order the model expects; None until the caller sets it.
        self.feature_columns = None
        # Starts at 0.5; nothing in this class updates it, so callers who
        # use find_optimal_threshold must store the result themselves.
        self.conversion_threshold = 0.5

    @staticmethod
    def _column(leads_df: pd.DataFrame, name: str, default) -> pd.Series:
        """Return column ``name`` or an index-aligned Series filled with ``default``.

        ``DataFrame.get(name, default)`` hands back the bare scalar when the
        column is absent, which breaks any chained ``.map`` / ``.fillna`` /
        ``.apply`` call. Always returning a Series keeps those chains valid.
        """
        if name in leads_df.columns:
            return leads_df[name]
        return pd.Series(default, index=leads_df.index, name=name)

    def build_feature_matrix(self, leads_df: pd.DataFrame) -> pd.DataFrame:
        """
        Build comprehensive feature set from lead data.

        Input: DataFrame with lead behavioral and firmographic data.
            Every input column is optional; a missing column (or a missing
            value inside a column) falls back to the default noted below.
        Output: Feature matrix for ML model, one row per input row, sharing
            the input's index, with all remaining NaN values replaced by 0.
        """
        col = self._column
        features = pd.DataFrame(index=leads_df.index)

        # ===== FIRMOGRAPHIC FIT =====
        # Company size: ordinal bucket 1-7; unknown sizes get the middle-low
        # bucket 3 (same as '51-200').
        size_map = {'1-10': 1, '11-50': 2, '51-200': 3, '201-500': 4,
                    '501-1000': 5, '1001-5000': 6, '5000+': 7}
        features['company_size_score'] = (
            col(leads_df, 'company_size', '51-200').map(size_map).fillna(3)
        )

        # Industry fit (score based on your historical conversion by industry)
        industry_fit = {
            'Technology': 0.35, 'Financial Services': 0.28, 'Healthcare': 0.25,
            'Retail': 0.18, 'Manufacturing': 0.20, 'Education': 0.15
        }
        features['industry_fit_score'] = (
            col(leads_df, 'industry', 'Other').map(industry_fit).fillna(0.15)
        )

        # Job title/seniority: case-sensitive substring match on the title.
        decision_maker_titles = ['CEO', 'CTO', 'VP', 'Director', 'Head of', 'Chief']
        features['is_decision_maker'] = col(leads_df, 'job_title', '').apply(
            lambda t: int(any(title in str(t) for title in decision_maker_titles))
        )

        # Technology stack fit (if you know what tech they use)
        features['tech_stack_score'] = (
            col(leads_df, 'tech_stack_fit_score', 0.5).fillna(0.5)
        )

        # ===== BEHAVIORAL ENGAGEMENT =====
        # Email engagement
        features['email_open_rate'] = col(leads_df, 'email_open_rate', 0).fillna(0)
        features['email_click_rate'] = col(leads_df, 'email_click_rate', 0).fillna(0)
        features['email_count'] = col(leads_df, 'emails_received', 0).fillna(0)

        # Website behavior
        features['page_views'] = col(leads_df, 'total_page_views', 0).fillna(0)
        features['pricing_page_views'] = col(leads_df, 'pricing_page_views', 0).fillna(0)
        features['case_study_views'] = col(leads_df, 'case_study_views', 0).fillna(0)
        # NEGATIVE signal: subtracted in the high-intent composite below.
        features['careers_page_views'] = col(leads_df, 'careers_page_views', 0).fillna(0)
        features['demo_page_visits'] = col(leads_df, 'demo_page_visits', 0).fillna(0)

        # Content engagement
        features['content_downloads'] = col(leads_df, 'content_downloads', 0).fillna(0)
        features['webinar_attendance'] = col(leads_df, 'webinars_attended', 0).fillna(0)

        # Intent signals (boolean flags encoded as 0/1)
        for flag in ('free_trial_started', 'demo_requested', 'pricing_requested'):
            features[flag] = col(leads_df, flag, False).fillna(False).astype(int)

        # ===== TIMING SIGNALS =====
        features['days_since_first_touch'] = (
            col(leads_df, 'days_since_first_touch', 30).fillna(30)
        )
        features['days_since_last_activity'] = (
            col(leads_df, 'days_since_last_activity', 7).fillna(7)
        )

        # Velocity (recent vs early engagement): recent activity divided by
        # the number of 14-day periods since first touch, plus one.
        features['recent_activity_score'] = col(leads_df, 'activity_last_14d', 0).fillna(0)
        features['engagement_velocity'] = (
            features['recent_activity_score']
            / (features['days_since_first_touch'] / 14 + 1)
        )

        # ===== SOURCE QUALITY =====
        source_conversion_rates = {
            'Referral': 0.40, 'Organic Search': 0.25, 'Paid Search': 0.18,
            'Content/Inbound': 0.22, 'Outbound SDR': 0.12, 'Social': 0.08,
            'Event': 0.30, 'Partner': 0.35
        }
        features['source_quality'] = col(leads_df, 'lead_source', 'Other').map(
            source_conversion_rates
        ).fillna(0.15)

        # ===== COMPOSITE SCORES =====
        # High-intent composite
        features['high_intent_score'] = (
            features['demo_requested'] * 3 +
            features['pricing_page_views'] * 2 +
            features['free_trial_started'] * 4 +
            features['pricing_requested'] * 3 -
            features['careers_page_views']  # Negative signal
        )

        return features.fillna(0)

    def find_optimal_threshold(self, X_val, y_val) -> float:
        """Find the probability threshold that maximizes F1.

        Uses the fitted ``self.model`` to score ``X_val`` and evaluates F1 at
        every point of the precision-recall curve against ``y_val``. Falls
        back to 0.5 if the best point has no associated threshold.
        """
        probs = self.model.predict_proba(X_val)[:, 1]
        precisions, recalls, thresholds = precision_recall_curve(y_val, probs)
        # The small epsilon avoids 0/0 where precision and recall are both 0.
        f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
        optimal_idx = int(np.argmax(f1_scores))
        # precisions/recalls carry one more entry than thresholds.
        if optimal_idx < len(thresholds):
            return float(thresholds[optimal_idx])
        return 0.5

    def score_leads(self, new_leads: pd.DataFrame) -> pd.DataFrame:
        """Score new leads and assign tier (A/B/C/D).

        ``new_leads`` must contain the identity columns ``lead_id``,
        ``name``, ``company``, ``email`` and ``job_title`` (a missing one
        raises ``KeyError``). Returns those columns plus
        ``conversion_probability``, ``score_100``, ``tier`` and ``routing``,
        sorted by probability, highest first.
        """
        features = self.build_feature_matrix(new_leads)
        # Align to the training column order; unseen columns are filled with 0.
        features = features.reindex(columns=self.feature_columns, fill_value=0)
        X = self.scaler.transform(features)
        probabilities = self.model.predict_proba(X)[:, 1]

        result = new_leads[['lead_id', 'name', 'company', 'email', 'job_title']].copy()
        result['conversion_probability'] = probabilities.round(3)
        result['score_100'] = (probabilities * 100).round(0).astype(int)
        # include_lowest=True keeps a probability of exactly 0.0 in tier D
        # instead of leaving it untiered (NaN). The 1.01 upper edge is the
        # original's choice and comfortably includes 1.0.
        result['tier'] = pd.cut(
            probabilities,
            bins=[0, 0.2, 0.4, 0.65, 1.01],
            labels=['D', 'C', 'B', 'A'],
            include_lowest=True
        )
        result['routing'] = result['tier'].map({
            'A': 'Immediate AE outreach',
            'B': 'SDR sequence + AE if responds',
            'C': 'Nurture sequence (marketing)',
            'D': 'Long-term nurture or discard'
        })
        return result.sort_values('conversion_probability', ascending=False)

    def calculate_roi(self, scored_leads: pd.DataFrame,
                      avg_deal_value: float,
                      conversion_rates: dict) -> dict:
        """Calculate expected pipeline from scored lead batch.

        ``conversion_rates`` maps tier label to expected conversion rate;
        tiers absent from the mapping use 0.1. Returns lead count, tier
        distribution, expected opportunities, expected pipeline value and
        pipeline value per lead (0 for an empty batch).
        """
        tier_counts = scored_leads['tier'].value_counts()
        total_leads = len(scored_leads)

        expected_opps = 0.0
        expected_revenue = 0.0
        for tier, count in tier_counts.items():
            conv_rate = conversion_rates.get(tier, 0.1)
            # float() keeps the totals as native numbers, not NumPy scalars.
            opps = float(count) * conv_rate
            expected_opps += opps
            expected_revenue += opps * avg_deal_value

        return {
            'total_leads': total_leads,
            'tier_distribution': tier_counts.to_dict(),
            'expected_opportunities': round(expected_opps),
            'expected_pipeline': round(expected_revenue),
            # Guard against dividing by zero on an empty batch.
            'roi_per_lead': round(expected_revenue / total_leads) if total_leads else 0
        }
# Segment leads for different outreach strategies
def segment_leads_for_outreach(
    scored_leads: pd.DataFrame,
    *,
    hot_threshold: int = 80,
    warm_threshold: int = 50,
    cool_threshold: int = 20,
) -> dict:
    """Group leads into outreach segments.

    Splits ``scored_leads`` on its ``score_100`` column into four
    non-overlapping bands:

    - ``immediate_hot``:  score >= ``hot_threshold``
    - ``follow_up_warm``: ``warm_threshold`` <= score < ``hot_threshold``
    - ``nurture_cool``:   ``cool_threshold`` <= score < ``warm_threshold``
    - ``disqualify``:     score < ``cool_threshold``

    The defaults (80 / 50 / 20) reproduce the original fixed cut-offs.

    Returns a dict mapping each segment name to a list of row dicts
    (``DataFrame.to_dict('records')``).

    Raises ``ValueError`` if the thresholds are not in ascending order,
    since the bands would then overlap or leave gaps.
    """
    if not cool_threshold <= warm_threshold <= hot_threshold:
        raise ValueError(
            "thresholds must satisfy cool_threshold <= warm_threshold <= hot_threshold"
        )

    scores = scored_leads['score_100']
    masks = {
        'immediate_hot': scores >= hot_threshold,
        'follow_up_warm': (scores >= warm_threshold) & (scores < hot_threshold),
        'nurture_cool': (scores >= cool_threshold) & (scores < warm_threshold),
        'disqualify': scores < cool_threshold,
    }
    return {
        name: scored_leads[mask].to_dict('records')
        for name, mask in masks.items()
    }
Results Companies Are Seeing
HubSpot research (2023):
Real implementation data (mid-size SaaS, 1,000 leads/month):
The model improves over time as it sees more outcomes — building a sustainable competitive advantage.
Also available in 中文.