← Back to tutorials

AI Lead Scoring: Building Models That Identify Your Best Prospects

How to train ML models on your CRM data to automatically score and prioritize leads

AI Lead Scoring: Identifying Your Best Prospects Automatically

Sales teams waste enormous time on leads that will never buy. AI lead scoring changes this — prioritizing leads based on actual signals that predict conversion, not arbitrary rules.

The Problem with Traditional Lead Scoring

Most companies use point-based scoring:

  • +10 points: Opened an email
  • +20 points: Visited pricing page
  • +50 points: Requested a demo
  • -5 points: Unsubscribed from one email
This is better than nothing, but it has fundamental problems:

  • Points are assigned by gut feel, not data
  • Weights are fixed guesses that ignore how strongly each behavior actually correlates with conversion
  • Doesn't account for firmographic fit
  • Misses subtle negative signals (e.g., a careers-page visit suggests a job seeker, not a buyer)
Building a Behavioral ML Lead Scoring Model

    python
    import pandas as pd
    import numpy as np
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import roc_auc_score, precision_recall_curve
    import json

    class LeadScoringModel:
        """Predict lead-to-opportunity conversion probability.

        Fit the model on historical leads with known outcomes via ``train``,
        then rank new leads with ``score_leads``.

        Attributes are plain instance fields:
            model: gradient-boosting classifier producing conversion probabilities.
            scaler: standardizes the feature matrix before it reaches the model.
            feature_columns: column order used at training time (``None`` until
                set); ``score_leads`` reindexes new features to this order.
            conversion_threshold: probability cut-off, 0.5 until tuned by ``train``.
        """

        # Ordinal encoding of company-size buckets; unknown buckets become 3.
        _SIZE_MAP = {
            '1-10': 1, '11-50': 2, '51-200': 3, '201-500': 4,
            '501-1000': 5, '1001-5000': 6, '5000+': 7,
        }

        # Industry fit (score based on your historical conversion by industry).
        _INDUSTRY_FIT = {
            'Technology': 0.35, 'Financial Services': 0.28, 'Healthcare': 0.25,
            'Retail': 0.18, 'Manufacturing': 0.20, 'Education': 0.15,
        }

        # Case-sensitive substrings that mark a job title as a decision maker.
        _DECISION_MAKER_TITLES = ('CEO', 'CTO', 'VP', 'Director', 'Head of', 'Chief')

        # (feature name, source column) pairs that default to 0 when absent.
        _COUNT_FEATURES = (
            # Email engagement
            ('email_open_rate', 'email_open_rate'),
            ('email_click_rate', 'email_click_rate'),
            ('email_count', 'emails_received'),
            # Website behavior
            ('page_views', 'total_page_views'),
            ('pricing_page_views', 'pricing_page_views'),
            ('case_study_views', 'case_study_views'),
            ('careers_page_views', 'careers_page_views'),  # NEGATIVE signal
            ('demo_page_visits', 'demo_page_visits'),
            # Content engagement
            ('content_downloads', 'content_downloads'),
            ('webinar_attendance', 'webinars_attended'),
        )

        # Boolean intent signals, encoded as 0/1.
        _INTENT_FLAGS = ('free_trial_started', 'demo_requested', 'pricing_requested')

        _SOURCE_CONVERSION_RATES = {
            'Referral': 0.40, 'Organic Search': 0.25, 'Paid Search': 0.18,
            'Content/Inbound': 0.22, 'Outbound SDR': 0.12, 'Social': 0.08,
            'Event': 0.30, 'Partner': 0.35,
        }

        # Lead identity columns copied into the scoring output when present.
        _IDENTITY_COLUMNS = ('lead_id', 'name', 'company', 'email', 'job_title')

        _TIER_BINS = [0, 0.2, 0.4, 0.65, 1.01]
        _TIER_LABELS = ['D', 'C', 'B', 'A']
        _TIER_ROUTING = {
            'A': 'Immediate AE outreach',
            'B': 'SDR sequence + AE if responds',
            'C': 'Nurture sequence (marketing)',
            'D': 'Long-term nurture or discard',
        }

        def __init__(self):
            self.model = GradientBoostingClassifier(
                n_estimators=200,
                max_depth=4,
                learning_rate=0.05,
                subsample=0.8,
                random_state=42,
            )
            self.scaler = StandardScaler()
            self.feature_columns = None
            self.conversion_threshold = 0.5  # Optimized during training

        @staticmethod
        def _column(leads_df: pd.DataFrame, name: str, default) -> pd.Series:
            """Return column ``name``, or a Series of ``default`` if it is absent.

            ``DataFrame.get(name, default)`` hands back the bare scalar when the
            column is missing, which breaks the Series methods called afterwards.
            """
            if name in leads_df.columns:
                return leads_df[name]
            return pd.Series(default, index=leads_df.index)

        @classmethod
        def _assign_tiers(cls, probabilities):
            """Bucket probabilities into tiers D/C/B/A.

            ``pd.cut`` bins are right-closed, so without ``include_lowest`` a
            probability of exactly 0.0 would fall outside the first bin and
            receive a NaN tier.
            """
            return pd.cut(
                probabilities,
                bins=cls._TIER_BINS,
                labels=cls._TIER_LABELS,
                include_lowest=True,
            )

        def build_feature_matrix(self, leads_df: pd.DataFrame) -> pd.DataFrame:
            """Build the model's feature set from lead data.

            Every source column is optional: a missing column is replaced by the
            same default that is used for missing values inside that column.

            Args:
                leads_df: DataFrame with lead behavioral and firmographic data.

            Returns:
                Feature matrix indexed like ``leads_df``, with NaNs replaced by 0.
            """
            col = self._column
            features = pd.DataFrame(index=leads_df.index)

            # ===== FIRMOGRAPHIC FIT =====
            features['company_size_score'] = (
                col(leads_df, 'company_size', '51-200').map(self._SIZE_MAP).fillna(3)
            )
            features['industry_fit_score'] = (
                col(leads_df, 'industry', 'Other').map(self._INDUSTRY_FIT).fillna(0.15)
            )
            # str() keeps None/NaN titles from raising; they simply match nothing.
            features['is_decision_maker'] = col(leads_df, 'job_title', '').apply(
                lambda t: int(any(title in str(t) for title in self._DECISION_MAKER_TITLES))
            )
            # Technology stack fit (if you know what tech they use)
            features['tech_stack_score'] = col(leads_df, 'tech_stack_fit_score', 0.5).fillna(0.5)

            # ===== BEHAVIORAL ENGAGEMENT =====
            for feature_name, source_column in self._COUNT_FEATURES:
                features[feature_name] = col(leads_df, source_column, 0).fillna(0)

            # Intent signals
            for flag in self._INTENT_FLAGS:
                features[flag] = col(leads_df, flag, False).fillna(False).astype(int)

            # ===== TIMING SIGNALS =====
            features['days_since_first_touch'] = col(leads_df, 'days_since_first_touch', 30).fillna(30)
            features['days_since_last_activity'] = col(leads_df, 'days_since_last_activity', 7).fillna(7)

            # Velocity (recent vs early engagement): recent activity divided by
            # the number of 14-day periods since first touch, plus one.
            features['recent_activity_score'] = col(leads_df, 'activity_last_14d', 0).fillna(0)
            features['engagement_velocity'] = (
                features['recent_activity_score']
                / (features['days_since_first_touch'] / 14 + 1)
            )

            # ===== SOURCE QUALITY =====
            features['source_quality'] = (
                col(leads_df, 'lead_source', 'Other')
                .map(self._SOURCE_CONVERSION_RATES)
                .fillna(0.15)
            )

            # ===== COMPOSITE SCORES =====
            # High-intent composite
            features['high_intent_score'] = (
                features['demo_requested'] * 3
                + features['pricing_page_views'] * 2
                + features['free_trial_started'] * 4
                + features['pricing_requested'] * 3
                - features['careers_page_views']  # Negative signal
            )

            return features.fillna(0)

        def train(self, leads_df: pd.DataFrame, converted, validation_fraction: float = 0.2) -> dict:
            """Fit the scaler and classifier on historical leads and tune the threshold.

            A seeded random hold-out of ``validation_fraction`` of the rows is
            used to pick ``conversion_threshold`` and report AUC. Tuning is skipped
            (threshold left unchanged) when the hold-out is empty or contains
            only one class.

            Args:
                leads_df: historical leads, same schema as ``build_feature_matrix``.
                converted: 0/1 (or boolean) outcomes, positionally aligned with
                    the rows of ``leads_df``.
                validation_fraction: share of rows held out, in ``[0, 1)``.

            Returns:
                Dict with ``n_train``, ``n_validation``, ``conversion_threshold``
                and, when validation ran, ``validation_auc``.

            Raises:
                ValueError: on length mismatch, an out-of-range fraction, or
                    training rows that do not contain both outcomes.
            """
            features = self.build_feature_matrix(leads_df)
            labels = np.asarray(converted).astype(int)
            if len(labels) != len(features):
                raise ValueError('converted must have one entry per row of leads_df')
            if not 0 <= validation_fraction < 1:
                raise ValueError('validation_fraction must be in [0, 1)')

            shuffled = np.random.RandomState(42).permutation(len(features))
            n_val = int(len(shuffled) * validation_fraction)
            val_idx, train_idx = shuffled[:n_val], shuffled[n_val:]
            if len(np.unique(labels[train_idx])) < 2:
                raise ValueError(
                    'training data must contain both converted and non-converted leads'
                )

            self.feature_columns = list(features.columns)
            X_train = self.scaler.fit_transform(features.iloc[train_idx])
            self.model.fit(X_train, labels[train_idx])

            metrics = {'n_train': len(train_idx), 'n_validation': n_val}
            y_val = labels[val_idx]
            if n_val and len(np.unique(y_val)) == 2:
                X_val = self.scaler.transform(features.iloc[val_idx])
                self.conversion_threshold = self.find_optimal_threshold(X_val, y_val)
                val_probs = self.model.predict_proba(X_val)[:, 1]
                metrics['validation_auc'] = float(roc_auc_score(y_val, val_probs))
            metrics['conversion_threshold'] = self.conversion_threshold
            return metrics

        def find_optimal_threshold(self, X_val, y_val) -> float:
            """Find the probability threshold that maximizes F1.

            ``X_val`` is passed straight to ``self.model.predict_proba``, so it
            must already be scaled the way the model was trained.
            """
            probs = self.model.predict_proba(X_val)[:, 1]
            precisions, recalls, thresholds = precision_recall_curve(y_val, probs)
            # The epsilon avoids 0/0 where precision and recall are both zero.
            f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
            optimal_idx = np.argmax(f1_scores)
            # precisions/recalls have one more entry than thresholds; fall back
            # to 0.5 if the best F1 sits on that final, threshold-less point.
            if optimal_idx < len(thresholds):
                return float(thresholds[optimal_idx])
            return 0.5

        def score_leads(self, new_leads: pd.DataFrame) -> pd.DataFrame:
            """Score new leads and assign tier (A/B/C/D) plus a routing action.

            Returns one row per lead, sorted by descending conversion
            probability, carrying whichever identity columns (``lead_id``,
            ``name``, ``company``, ``email``, ``job_title``) the input has.
            """
            features = self.build_feature_matrix(new_leads)
            features = features.reindex(columns=self.feature_columns, fill_value=0)
            X = self.scaler.transform(features)
            probabilities = self.model.predict_proba(X)[:, 1]

            present = [c for c in self._IDENTITY_COLUMNS if c in new_leads.columns]
            result = new_leads[present].copy()
            result['conversion_probability'] = probabilities.round(3)
            result['score_100'] = (probabilities * 100).round(0).astype(int)
            result['tier'] = self._assign_tiers(probabilities)
            result['routing'] = result['tier'].map(self._TIER_ROUTING)

            return result.sort_values('conversion_probability', ascending=False)

        def calculate_roi(self, scored_leads: pd.DataFrame, avg_deal_value: float,
                          conversion_rates: dict) -> dict:
            """Calculate expected pipeline from a scored lead batch.

            Args:
                scored_leads: DataFrame with a ``tier`` column.
                avg_deal_value: revenue attributed to one opportunity.
                conversion_rates: tier -> conversion rate; tiers not listed
                    use 0.1.

            Returns:
                Dict with lead count, tier distribution, expected opportunities,
                expected pipeline and pipeline per lead (0 for an empty batch).
            """
            tier_counts = scored_leads['tier'].value_counts()
            total_leads = len(scored_leads)

            expected_opps = 0
            expected_revenue = 0
            for tier, count in tier_counts.items():
                opps = count * conversion_rates.get(tier, 0.1)
                expected_opps += opps
                expected_revenue += opps * avg_deal_value

            return {
                'total_leads': total_leads,
                'tier_distribution': tier_counts.to_dict(),
                'expected_opportunities': round(expected_opps),
                'expected_pipeline': round(expected_revenue),
                # Guard the empty batch instead of raising ZeroDivisionError.
                'roi_per_lead': round(expected_revenue / total_leads) if total_leads else 0,
            }

    Segment leads for different outreach strategies

    def segment_leads_for_outreach(scored_leads: pd.DataFrame) -> dict:
        """Split scored leads into four outreach buckets keyed by ``score_100``.

        Buckets (lower bound inclusive, upper bound exclusive):
            immediate_hot  : score >= 80
            follow_up_warm : 50 <= score < 80
            nurture_cool   : 20 <= score < 50
            disqualify     : score < 20

        Returns a dict mapping each bucket name to a list of row dicts.
        """
        score = scored_leads['score_100']
        bucket_masks = {
            'immediate_hot': score >= 80,
            'follow_up_warm': (score >= 50) & (score < 80),
            'nurture_cool': (score >= 20) & (score < 50),
            'disqualify': score < 20,
        }

        outreach = {}
        for bucket, mask in bucket_masks.items():
            outreach[bucket] = scored_leads[mask].to_dict('records')
        return outreach

    Results Companies Are Seeing

    HubSpot research (2023):

  • Companies with AI lead scoring convert leads at 2-3x higher rates
  • Average 50% reduction in time spent on unqualified leads
  • 30% improvement in sales productivity
Real implementation data (mid-size SaaS, 1,000 leads/month):

  • Before AI: 12% lead-to-opportunity rate
  • After AI: 28% lead-to-opportunity rate (same volume, better selection)
  • Sales cycle shortened by 20% (reps focused on high-fit leads)
The model improves over time as it is retrained on more outcomes — building a sustainable competitive advantage.

    Also available in 中文.

    AI Lead Scoring: Building Models That Identify Your Best Prospects | AI Skill Navigation | AI Skill Navigation