Coverage for backend / app / job_email_scraping / email_parsers / linkedin.py: 96%
80 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-18 23:17 +0000
1"""LinkedIn job alert email parser."""
3import re
5from bs4 import BeautifulSoup
7from app.job_email_scraping.email_parsers.utils import process_salary, Platform
8from app.job_email_scraping.schemas import Salary, JobInfo, JobResult
10BASE_URL = "https://www.linkedin.com/jobs/view/"
# CSS class strings LinkedIn uses in alert emails; hoisted so the primary
# parse, the forwarded-email fallback, and the helpers stay in sync.
_TITLE_LINK_CLASS = "font-bold text-md leading-regular text-system-blue-50"
_COMPANY_LOCATION_CLASS = "text-system-gray-100 text-xs leading-regular mt-0.5 line-clamp-1 text-ellipsis"
_SALARY_CLASS = "text-system-gray-70 text-xs leading-regular mt-0.5"


def _find_forwarded_job_cards(soup) -> list:
    """Locate job-card <td> containers in forwarded emails.

    Forwarded emails lose the ``data-test-id`` attribute, so we walk up from
    each job-title link until we find a <td> that contains both a title link
    and a company/location paragraph.

    :param soup: parsed BeautifulSoup document
    :return: list of job-card tags, deduplicated, in document order
    """
    cards = []
    seen_ids = set()  # id() of tags already collected, to avoid duplicates
    for link in soup.find_all("a", class_=_TITLE_LINK_CLASS):
        current = link
        for _ in range(20):  # cap traversal depth to guard against cycles/huge trees
            current = current.parent
            if current is None:
                break
            if current.name != "td":
                continue
            has_title = current.find("a", class_=_TITLE_LINK_CLASS) is not None
            has_company = current.find("p", class_=_COMPANY_LOCATION_CLASS) is not None
            if has_title and has_company:
                if id(current) not in seen_ids:
                    cards.append(current)
                    seen_ids.add(id(current))
                break
    return cards


def _extract_job_id(url: str) -> str | None:
    """Extract the numeric job ID from a LinkedIn job URL.

    Handles plain integer IDs, IDs mangled into scientific notation by email
    clients, and collection URLs (e.g. top-applicant) that carry the ID in a
    ``currentJobId`` query parameter.

    :param str url: job link href from the email
    :return: job ID as a string, or None if no ID could be found
    """
    matches = re.findall(r"linkedin\.com/(?:comm/)?jobs/view/([\d.e+]+)", url, re.IGNORECASE)
    if matches:
        try:
            # Normalise scientific notation (e.g. "4.1e+09") to an integer string.
            return str(int(float(matches[0])))
        except (ValueError, OverflowError):
            return matches[0]
    current_job_match = re.search(r"[?&]currentJobId=(\d+)", url, re.IGNORECASE)
    if current_job_match:
        return current_job_match.group(1)
    return None


def parse_linkedin_job_email(body: str) -> list[JobResult]:
    """Parse LinkedIn job alert email and extract job information.

    Compatible with both regular and forwarded email formats.

    :param str body: email body (HTML)
    :return: list of JobResult objects containing job information
    """
    soup = BeautifulSoup(body, "html.parser", from_encoding="utf-8")

    # Primary method: job cards tagged with data-test-id (regular emails).
    job_cards = soup.find_all("td", {"data-test-id": "job-card"})

    # Fallback: forwarded emails strip data-test-id; recover cards structurally.
    if not job_cards:
        job_cards = _find_forwarded_job_cards(soup)

    jobs = []
    for card in job_cards:
        title = None
        url = None
        job_id = None
        company = None
        location = None
        salary_currency = None
        salary_min = None
        salary_max = None

        # Job title and URL.
        title_tag = card.find("a", class_=_TITLE_LINK_CLASS)
        if title_tag:
            title = " ".join(title_tag.get_text(strip=True).split())
            url = title_tag.get("href", None)

        if url:
            job_id = _extract_job_id(url)

        # Company name and location, formatted as "Company · Location".
        company_location_tag = card.find("p", class_=_COMPANY_LOCATION_CLASS)
        if company_location_tag:
            text = company_location_tag.get_text(strip=True)
            parts = [part.strip() for part in text.split("·")]
            if len(parts) >= 2:
                company = " ".join(parts[0].split())
                location = " ".join(parts[1].split())

        # Salary, e.g. "£40K-£50K / year"; only yearly figures are recorded.
        salary_tag = card.find("p", class_=_SALARY_CLASS)
        if salary_tag:
            salary_text = salary_tag.get_text(strip=True)
            match = re.search(r"([£$€])(\d+\.?\d*[KM]?)-([£$€])(\d+\.?\d*[KM]?)\s*/\s*(\w+)", salary_text)
            if match and match.group(5) == "year":
                salary_currency = match.group(1)
                salary_min = process_salary(match.group(2))
                salary_max = process_salary(match.group(4))

        # BUGFIX: `BASE_URL + job_id` raised TypeError whenever no job ID could
        # be extracted (job_id is None); fall back to None for the canonical URL.
        processed_url = BASE_URL + job_id if job_id else None

        # Build Pydantic objects for the result.
        salary = Salary(min_amount=salary_min, max_amount=salary_max, currency=salary_currency)
        job_info = JobInfo(title=title, url=processed_url, raw_url=url, salary=salary)
        job_result = JobResult(
            company=company, job_id=job_id, location=location, job=job_info, platform=Platform.LINKEDIN
        )
        jobs.append(job_result)

    return jobs
132def extract_alert_name(alert_string: str) -> str | None:
133 """Extract alert title from LinkedIn job alert email strings.
134 :param str alert_string: alert string from email
135 :return: extracted job title or None if not found"""
137 # Pattern: Extract text between quotes
138 pattern = r"“([^”]+)”"
139 match = re.search(pattern, alert_string)
140 if match:
141 return match.group(1).strip()
143 return None