Coverage for backend / app / job_email_scraping / email_parsers / linkedin.py: 96%

80 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-18 23:17 +0000

1"""LinkedIn job alert email parser.""" 

2 

3import re 

4 

5from bs4 import BeautifulSoup 

6 

7from app.job_email_scraping.email_parsers.utils import process_salary, Platform 

8from app.job_email_scraping.schemas import Salary, JobInfo, JobResult 

9 

10BASE_URL = "https://www.linkedin.com/jobs/view/" 

11 

12 

# CSS class strings LinkedIn uses in alert emails; hoisted so the primary
# parse path and the forwarded-email fallback stay in sync.
_TITLE_LINK_CLASS = "font-bold text-md leading-regular text-system-blue-50"
_COMPANY_LOCATION_CLASS = "text-system-gray-100 text-xs leading-regular mt-0.5 line-clamp-1 text-ellipsis"
_SALARY_CLASS = "text-system-gray-70 text-xs leading-regular mt-0.5"

# Compiled once: these patterns run for every job card in every email.
_JOB_ID_PATTERN = re.compile(r"linkedin\.com/(?:comm/)?jobs/view/([\d.e+]+)", re.IGNORECASE)
_CURRENT_JOB_ID_PATTERN = re.compile(r"[?&]currentJobId=(\d+)", re.IGNORECASE)
_SALARY_PATTERN = re.compile(r"([£$€])(\d+\.?\d*[KM]?)-([£$€])(\d+\.?\d*[KM]?)\s*/\s*(\w+)")


def _find_forwarded_job_cards(soup) -> list:
    """Locate job-card <td> containers in forwarded emails.

    Forwarding tends to strip the ``data-test-id`` attributes, so instead
    walk up from each job-title link until a <td> is found that contains
    both a title link and a company/location line.

    :param soup: parsed BeautifulSoup document
    :return: list of candidate job-card <td> tags (deduplicated)
    """
    cards = []
    seen_ids = set()
    for link in soup.find_all("a", class_=_TITLE_LINK_CLASS):
        current = link
        for _ in range(20):  # bounded ancestor walk guards against pathological HTML
            current = current.parent
            if current is None:
                break
            if current.name != "td":
                continue
            has_title = current.find("a", class_=_TITLE_LINK_CLASS) is not None
            has_company = current.find("p", class_=_COMPANY_LOCATION_CLASS) is not None
            if has_title and has_company:
                # Deduplicate by object identity: the same <td> can be
                # reached from several title links in the same card.
                if id(current) not in seen_ids:
                    cards.append(current)
                    seen_ids.add(id(current))
                break
    return cards


def _extract_job_id(url: str) -> str | None:
    """Extract the numeric job ID from a LinkedIn job URL.

    Handles ``/jobs/view/<id>`` paths — including IDs mangled into
    scientific notation (e.g. ``4.2e+09``) by some mail pipelines — and
    collection URLs that carry the ID in a ``currentJobId`` query param.

    :param str url: href taken from the job-title link
    :return: job ID as a string, or None if no ID could be found
    """
    matches = _JOB_ID_PATTERN.findall(url)
    if matches:
        try:
            # Normalise scientific notation to a plain integer string.
            return str(int(float(matches[0])))
        except (ValueError, OverflowError):
            return matches[0]
    current_job_match = _CURRENT_JOB_ID_PATTERN.search(url)
    if current_job_match:
        return current_job_match.group(1)
    return None


def _extract_company_location(card) -> tuple[str | None, str | None]:
    """Extract (company, location) from a card's "Company · Location" line.

    Returns (None, None) when the line is missing or not "·"-separated.
    """
    tag = card.find("p", class_=_COMPANY_LOCATION_CLASS)
    if tag:
        parts = [part.strip() for part in tag.get_text(strip=True).split("·")]
        if len(parts) >= 2:
            # Collapse internal whitespace runs left over from HTML layout.
            return " ".join(parts[0].split()), " ".join(parts[1].split())
    return None, None


def _extract_salary(card) -> tuple:
    """Extract (currency, min, max) salary info from a job card.

    Only yearly salaries are recorded; other frequencies (e.g. "/hour")
    are deliberately ignored. Returns (None, None, None) when absent.
    """
    tag = card.find("p", class_=_SALARY_CLASS)
    if tag:
        match = _SALARY_PATTERN.search(tag.get_text(strip=True))
        if match and match.group(5) == "year":
            return match.group(1), process_salary(match.group(2)), process_salary(match.group(4))
    return None, None, None


def parse_linkedin_job_email(body: str) -> list[JobResult]:
    """Parse LinkedIn job alert email and extract job information.
    Compatible with both regular and forwarded email formats.

    :param str body: email body (HTML)
    :return: list of JobResult objects containing job information
    """
    soup = BeautifulSoup(body, "html.parser", from_encoding="utf-8")

    # Primary method: job cards tagged with data-test-id (regular emails).
    job_cards = soup.find_all("td", {"data-test-id": "job-card"})

    # Fallback for forwarded emails, where those attributes are stripped.
    if not job_cards:
        job_cards = _find_forwarded_job_cards(soup)

    jobs: list[JobResult] = []
    for card in job_cards:
        title = None
        url = None
        job_id = None

        # Job title and URL come from the title link.
        title_tag = card.find("a", class_=_TITLE_LINK_CLASS)
        if title_tag:
            title = " ".join(title_tag.get_text(strip=True).split())
            url = title_tag.get("href", None)
            if url:
                job_id = _extract_job_id(url)

        company, location = _extract_company_location(card)
        salary_currency, salary_min, salary_max = _extract_salary(card)

        # BUG FIX: the original did `BASE_URL + job_id` unconditionally,
        # raising TypeError whenever no job ID could be extracted. Fall
        # back to the raw URL (possibly None) in that case.
        processed_url = BASE_URL + job_id if job_id else url

        salary = Salary(min_amount=salary_min, max_amount=salary_max, currency=salary_currency)
        job_info = JobInfo(title=title, url=processed_url, raw_url=url, salary=salary)
        job_result = JobResult(
            company=company, job_id=job_id, location=location, job=job_info, platform=Platform.LINKEDIN
        )
        jobs.append(job_result)

    return jobs

130 

131 

132def extract_alert_name(alert_string: str) -> str | None: 

133 """Extract alert title from LinkedIn job alert email strings. 

134 :param str alert_string: alert string from email 

135 :return: extracted job title or None if not found""" 

136 

137 # Pattern: Extract text between quotes 

138 pattern = r"“([^”]+)”" 

139 match = re.search(pattern, alert_string) 

140 if match: 

141 return match.group(1).strip() 

142 

143 return None