AI Lead Scoring: Building Models That Identify Your Best Prospects

How to train ML models on your CRM data to automatically score and prioritize leads

返回教程列表
入门10 分钟

AI Lead Scoring: Building Models That Identify Your Best Prospects

How to train ML models on your CRM data to automatically score and prioritize leads

Learn how to build custom AI lead scoring systems that analyze behavioral signals, firmographic data, and engagement patterns to identify high-value prospects — helping sales teams focus effort on leads most likely to convert.

Tags: lead-scoring, sales-ai, crm, machine-learning, pipeline

AI Lead Scoring: Identifying Your Best Prospects Automatically

Sales teams waste enormous time on leads that will never buy. AI lead scoring changes this — prioritizing leads based on actual signals that predict conversion, not arbitrary rules.

The Problem with Traditional Lead Scoring

Most companies use point-based scoring:

  • +10 points: Opened an email
  • +20 points: Visited pricing page
  • +50 points: Requested a demo
  • -5 points: Unsubscribed from one email
This is better than nothing, but it has fundamental problems:

  • Points are assigned by gut feel, not data
  • Each behavior carries a fixed weight, regardless of how strongly it actually correlates with conversion
  • Doesn't account for firmographic fit
  • Misses implicit negative signals (e.g., visiting the careers page suggests a job seeker, not a buyer)

Building a Behavioral ML Lead Scoring Model

    python
    import pandas as pd
    import numpy as np
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import roc_auc_score, precision_recall_curve
    import json

    class LeadScoringModel:
        """Predict lead-to-opportunity conversion probability.

        Intended to be trained on historical leads with known outcomes.
        No training method is defined in this class as shown: ``model`` and
        ``scaler`` must be fitted, and ``feature_columns`` populated, before
        ``score_leads`` is called.
        """

        def __init__(self):
            self.model = GradientBoostingClassifier(
                n_estimators=200,
                max_depth=4,
                learning_rate=0.05,
                subsample=0.8,
                random_state=42,
            )
            self.scaler = StandardScaler()
            self.feature_columns = None
            self.conversion_threshold = 0.5  # Optimized during training

        @staticmethod
        def _column(leads_df, name, default):
            """Return column ``name``, or a constant Series when it is absent.

            ``DataFrame.get(name, default)`` hands back the bare scalar default
            for a missing column, which then breaks any chained Series method
            (``.map``, ``.fillna``, ``.apply``). Broadcasting the default over
            the frame's index keeps the downstream code uniform.
            """
            if name in leads_df.columns:
                return leads_df[name]
            return pd.Series(default, index=leads_df.index)

        @classmethod
        def _filled(cls, leads_df, name, default):
            """Return column ``name`` with missing values replaced by ``default``."""
            return cls._column(leads_df, name, default).fillna(default)

        @classmethod
        def _flag(cls, leads_df, name):
            """Return boolean column ``name`` as 0/1 integers (missing -> 0)."""
            return cls._filled(leads_df, name, False).astype(int)

        @staticmethod
        def _assign_tiers(probabilities):
            """Bucket probabilities into tiers D/C/B/A (lowest to highest).

            ``include_lowest=True`` closes the first bin on the left so that a
            probability of exactly 0.0 lands in tier D instead of becoming NaN.
            """
            return pd.cut(
                probabilities,
                bins=[0, 0.2, 0.4, 0.65, 1.01],
                labels=['D', 'C', 'B', 'A'],
                include_lowest=True,
            )

        def build_feature_matrix(self, leads_df: pd.DataFrame) -> pd.DataFrame:
            """Build the model's feature set from raw lead data.

            Every input column is optional: a missing column, or a missing
            value within a column, falls back to the per-feature default below.

            Args:
                leads_df: Lead behavioral and firmographic data, one row per lead.

            Returns:
                Feature matrix sharing ``leads_df``'s index, with remaining
                NaNs replaced by 0.
            """
            features = pd.DataFrame(index=leads_df.index)

            # ===== FIRMOGRAPHIC FIT =====
            # Company size (unknown sizes are treated as the '51-200' band)
            size_map = {
                '1-10': 1, '11-50': 2, '51-200': 3, '201-500': 4,
                '501-1000': 5, '1001-5000': 6, '5000+': 7,
            }
            features['company_size_score'] = (
                self._column(leads_df, 'company_size', '51-200')
                .map(size_map)
                .fillna(3)
            )

            # Industry fit (score based on your historical conversion by industry)
            industry_fit = {
                'Technology': 0.35,
                'Financial Services': 0.28,
                'Healthcare': 0.25,
                'Retail': 0.18,
                'Manufacturing': 0.20,
                'Education': 0.15,
            }
            features['industry_fit_score'] = (
                self._column(leads_df, 'industry', 'Other')
                .map(industry_fit)
                .fillna(0.15)
            )

            # Job title/seniority (case-sensitive substring match)
            decision_maker_titles = ['CEO', 'CTO', 'VP', 'Director', 'Head of', 'Chief']
            features['is_decision_maker'] = self._column(
                leads_df, 'job_title', ''
            ).apply(
                lambda t: int(any(title in str(t) for title in decision_maker_titles))
            )

            # Technology stack fit (if you know what tech they use)
            features['tech_stack_score'] = self._filled(
                leads_df, 'tech_stack_fit_score', 0.5
            )

            # ===== BEHAVIORAL ENGAGEMENT =====
            # Email engagement
            features['email_open_rate'] = self._filled(leads_df, 'email_open_rate', 0)
            features['email_click_rate'] = self._filled(leads_df, 'email_click_rate', 0)
            features['email_count'] = self._filled(leads_df, 'emails_received', 0)

            # Website behavior
            features['page_views'] = self._filled(leads_df, 'total_page_views', 0)
            features['pricing_page_views'] = self._filled(
                leads_df, 'pricing_page_views', 0
            )
            features['case_study_views'] = self._filled(leads_df, 'case_study_views', 0)
            # NEGATIVE signal
            features['careers_page_views'] = self._filled(
                leads_df, 'careers_page_views', 0
            )
            features['demo_page_visits'] = self._filled(leads_df, 'demo_page_visits', 0)

            # Content engagement
            features['content_downloads'] = self._filled(
                leads_df, 'content_downloads', 0
            )
            features['webinar_attendance'] = self._filled(
                leads_df, 'webinars_attended', 0
            )

            # Intent signals
            features['free_trial_started'] = self._flag(leads_df, 'free_trial_started')
            features['demo_requested'] = self._flag(leads_df, 'demo_requested')
            features['pricing_requested'] = self._flag(leads_df, 'pricing_requested')

            # ===== TIMING SIGNALS =====
            features['days_since_first_touch'] = self._filled(
                leads_df, 'days_since_first_touch', 30
            )
            features['days_since_last_activity'] = self._filled(
                leads_df, 'days_since_last_activity', 7
            )

            # Velocity (recent vs early engagement): last-14-day activity divided
            # by the lead's age measured in 14-day periods, plus one.
            features['recent_activity_score'] = self._filled(
                leads_df, 'activity_last_14d', 0
            )
            features['engagement_velocity'] = (
                features['recent_activity_score']
                / (features['days_since_first_touch'] / 14 + 1)
            )

            # ===== SOURCE QUALITY =====
            source_conversion_rates = {
                'Referral': 0.40,
                'Organic Search': 0.25,
                'Paid Search': 0.18,
                'Content/Inbound': 0.22,
                'Outbound SDR': 0.12,
                'Social': 0.08,
                'Event': 0.30,
                'Partner': 0.35,
            }
            features['source_quality'] = (
                self._column(leads_df, 'lead_source', 'Other')
                .map(source_conversion_rates)
                .fillna(0.15)
            )

            # ===== COMPOSITE SCORES =====
            # High-intent composite
            features['high_intent_score'] = (
                features['demo_requested'] * 3
                + features['pricing_page_views'] * 2
                + features['free_trial_started'] * 4
                + features['pricing_requested'] * 3
                - features['careers_page_views']  # Negative signal
            )

            return features.fillna(0)

        def find_optimal_threshold(self, X_val, y_val) -> float:
            """Find the probability threshold that maximizes F1.

            Args:
                X_val: Validation features, passed to ``self.model.predict_proba``.
                y_val: Validation labels, passed to ``precision_recall_curve``.

            Returns:
                The threshold with the highest F1, or 0.5 when the best index
                has no corresponding entry in the threshold array.
            """
            probs = self.model.predict_proba(X_val)[:, 1]
            precisions, recalls, thresholds = precision_recall_curve(y_val, probs)
            # The epsilon avoids 0/0 where precision and recall are both zero.
            f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
            optimal_idx = np.argmax(f1_scores)
            if optimal_idx < len(thresholds):
                return float(thresholds[optimal_idx])
            return 0.5

        def score_leads(self, new_leads: pd.DataFrame) -> pd.DataFrame:
            """Score new leads and assign tier (A/B/C/D).

            Requires ``self.scaler`` and ``self.model`` to be fitted already.

            Args:
                new_leads: Raw lead data. Must contain the identity columns
                    ``lead_id``, ``name``, ``company``, ``email`` and
                    ``job_title``; feature columns are optional.

            Returns:
                The identity columns plus ``conversion_probability``,
                ``score_100``, ``tier`` and ``routing``, sorted by descending
                probability.

            Raises:
                KeyError: If an identity column is missing from ``new_leads``.
            """
            features = self.build_feature_matrix(new_leads)
            # Align to the stored column list; columns not produced here become 0.
            features = features.reindex(columns=self.feature_columns, fill_value=0)

            X = self.scaler.transform(features)
            probabilities = self.model.predict_proba(X)[:, 1]

            result = new_leads[
                ['lead_id', 'name', 'company', 'email', 'job_title']
            ].copy()
            result['conversion_probability'] = probabilities.round(3)
            result['score_100'] = (probabilities * 100).round(0).astype(int)
            result['tier'] = self._assign_tiers(probabilities)
            result['routing'] = result['tier'].map({
                'A': 'Immediate AE outreach',
                'B': 'SDR sequence + AE if responds',
                'C': 'Nurture sequence (marketing)',
                'D': 'Long-term nurture or discard',
            })

            return result.sort_values('conversion_probability', ascending=False)

        def calculate_roi(self, scored_leads: pd.DataFrame,
                          avg_deal_value: float,
                          conversion_rates: dict) -> dict:
            """Calculate expected pipeline from scored lead batch.

            Args:
                scored_leads: Frame with a ``tier`` column.
                avg_deal_value: Revenue attributed to one opportunity.
                conversion_rates: Mapping of tier label to conversion rate;
                    tiers not listed use 0.1.

            Returns:
                Dict with ``total_leads``, ``tier_distribution``,
                ``expected_opportunities``, ``expected_pipeline`` and
                ``roi_per_lead`` (0 for an empty batch).
            """
            tier_counts = scored_leads['tier'].value_counts()
            total_leads = len(scored_leads)

            expected_opps = 0.0
            expected_revenue = 0.0
            for tier, count in tier_counts.items():
                opps = float(count) * conversion_rates.get(tier, 0.1)
                expected_opps += opps
                expected_revenue += opps * avg_deal_value

            # An empty batch has no per-lead value; avoid dividing by zero.
            roi_per_lead = round(expected_revenue / total_leads) if total_leads else 0

            return {
                'total_leads': total_leads,
                'tier_distribution': tier_counts.to_dict(),
                'expected_opportunities': round(expected_opps),
                'expected_pipeline': round(expected_revenue),
                'roi_per_lead': roi_per_lead,
            }

    # Segment leads for different outreach strategies

    def segment_leads_for_outreach(scored_leads: pd.DataFrame) -> dict:
        """Split scored leads into four outreach buckets by ``score_100``.

        Bands: ``immediate_hot`` (>= 80), ``follow_up_warm`` (50-79),
        ``nurture_cool`` (20-49) and ``disqualify`` (< 20).

        Args:
            scored_leads: Frame containing a ``score_100`` column.

        Returns:
            Dict mapping each segment name to a list of row records.
        """
        score = scored_leads['score_100']
        band_masks = {
            'immediate_hot': score >= 80,
            'follow_up_warm': (score >= 50) & (score < 80),
            'nurture_cool': (score >= 20) & (score < 50),
            'disqualify': score < 20,
        }
        return {
            label: scored_leads[mask].to_dict('records')
            for label, mask in band_masks.items()
        }

Results Companies Are Seeing

HubSpot research (2023):

  • Companies with AI lead scoring convert leads at 2-3x higher rates
  • Average 50% reduction in time spent on unqualified leads
  • 30% improvement in sales productivity
Real implementation data (mid-size SaaS, 1,000 leads/month):

  • Before AI: 12% lead-to-opportunity rate
  • After AI: 28% lead-to-opportunity rate (same volume, better selection)
  • Sales cycle shortened by 20% (reps focused on high-fit leads)
The model improves over time as it is retrained on more outcomes — building a sustainable competitive advantage.