Coverage for backend / app / job_email_scraping / email_parsers / linkedin.py: 96%

80 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-18 23:17 +0000

1"""LinkedIn job alert email parser.""" 

2 

3import re 

4 

5from bs4 import BeautifulSoup 

6 

7from app.job_email_scraping.email_parsers.utils import process_salary, Platform 

8from app.job_email_scraping.schemas import Salary, JobInfo, JobResult 

9 

10BASE_URL = "https://www.linkedin.com/jobs/view/" 

11 

12 

# CSS class strings LinkedIn uses in alert emails; hoisted so the primary
# parse path and the forwarded-email fallback stay in sync.
_TITLE_LINK_CLASS = "font-bold text-md leading-regular text-system-blue-50"
_COMPANY_LOCATION_CLASS = "text-system-gray-100 text-xs leading-regular mt-0.5 line-clamp-1 text-ellipsis"
_SALARY_CLASS = "text-system-gray-70 text-xs leading-regular mt-0.5"

# Compiled once: these patterns run for every job card in every email.
_JOB_ID_PATTERN = re.compile(r"linkedin\.com/(?:comm/)?jobs/view/([\d.e+]+)", re.IGNORECASE)
_CURRENT_JOB_ID_PATTERN = re.compile(r"[?&]currentJobId=(\d+)", re.IGNORECASE)
_SALARY_PATTERN = re.compile(r"([£$€])(\d+\.?\d*[KM]?)-([£$€])(\d+\.?\d*[KM]?)\s*/\s*(\w+)")


def _find_forwarded_job_cards(soup) -> list:
    """Locate job-card <td> containers in forwarded emails.

    Forwarding tends to strip the ``data-test-id`` attributes, so instead
    walk up from each job-title link until a <td> is found that contains
    both a title link and a company/location line.

    :param soup: parsed BeautifulSoup document
    :return: list of candidate job-card <td> tags (deduplicated)
    """
    cards = []
    seen_ids = set()
    for link in soup.find_all("a", class_=_TITLE_LINK_CLASS):
        current = link
        for _ in range(20):  # bounded ancestor walk guards against pathological HTML
            current = current.parent
            if current is None:
                break
            if current.name != "td":
                continue
            has_title = current.find("a", class_=_TITLE_LINK_CLASS) is not None
            has_company = current.find("p", class_=_COMPANY_LOCATION_CLASS) is not None
            if has_title and has_company:
                # Deduplicate by object identity: the same <td> can be
                # reached from several title links in the same card.
                if id(current) not in seen_ids:
                    cards.append(current)
                    seen_ids.add(id(current))
                break
    return cards


def _extract_job_id(url: str) -> str | None:
    """Extract the numeric job ID from a LinkedIn job URL.

    Handles ``/jobs/view/<id>`` paths — including IDs mangled into
    scientific notation (e.g. ``4.2e+09``) by some mail pipelines — and
    collection URLs that carry the ID in a ``currentJobId`` query param.

    :param str url: href taken from the job-title link
    :return: job ID as a string, or None if no ID could be found
    """
    matches = _JOB_ID_PATTERN.findall(url)
    if matches:
        try:
            # Normalise scientific notation to a plain integer string.
            return str(int(float(matches[0])))
        except (ValueError, OverflowError):
            return matches[0]
    current_job_match = _CURRENT_JOB_ID_PATTERN.search(url)
    if current_job_match:
        return current_job_match.group(1)
    return None


def _extract_company_location(card) -> tuple[str | None, str | None]:
    """Extract (company, location) from a card's "Company · Location" line.

    Returns (None, None) when the line is missing or not "·"-separated.
    """
    tag = card.find("p", class_=_COMPANY_LOCATION_CLASS)
    if tag:
        parts = [part.strip() for part in tag.get_text(strip=True).split("·")]
        if len(parts) >= 2:
            # Collapse internal whitespace runs left over from HTML layout.
            return " ".join(parts[0].split()), " ".join(parts[1].split())
    return None, None


def _extract_salary(card) -> tuple:
    """Extract (currency, min, max) salary info from a job card.

    Only yearly salaries are recorded; other frequencies (e.g. "/hour")
    are deliberately ignored. Returns (None, None, None) when absent.
    """
    tag = card.find("p", class_=_SALARY_CLASS)
    if tag:
        match = _SALARY_PATTERN.search(tag.get_text(strip=True))
        if match and match.group(5) == "year":
            return match.group(1), process_salary(match.group(2)), process_salary(match.group(4))
    return None, None, None


def parse_linkedin_job_email(body: str) -> list[JobResult]:
    """Parse LinkedIn job alert email and extract job information.
    Compatible with both regular and forwarded email formats.

    :param str body: email body (HTML)
    :return: list of JobResult objects containing job information
    """
    soup = BeautifulSoup(body, "html.parser", from_encoding="utf-8")

    # Primary method: job cards tagged with data-test-id (regular emails).
    job_cards = soup.find_all("td", {"data-test-id": "job-card"})

    # Fallback for forwarded emails, where those attributes are stripped.
    if not job_cards:
        job_cards = _find_forwarded_job_cards(soup)

    jobs: list[JobResult] = []
    for card in job_cards:
        title = None
        url = None
        job_id = None

        # Job title and URL come from the title link.
        title_tag = card.find("a", class_=_TITLE_LINK_CLASS)
        if title_tag:
            title = " ".join(title_tag.get_text(strip=True).split())
            url = title_tag.get("href", None)
            if url:
                job_id = _extract_job_id(url)

        company, location = _extract_company_location(card)
        salary_currency, salary_min, salary_max = _extract_salary(card)

        # BUG FIX: the original did `BASE_URL + job_id` unconditionally,
        # raising TypeError whenever no job ID could be extracted. Fall
        # back to the raw URL (possibly None) in that case.
        processed_url = BASE_URL + job_id if job_id else url

        salary = Salary(min_amount=salary_min, max_amount=salary_max, currency=salary_currency)
        job_info = JobInfo(title=title, url=processed_url, raw_url=url, salary=salary)
        job_result = JobResult(
            company=company, job_id=job_id, location=location, job=job_info, platform=Platform.LINKEDIN
        )
        jobs.append(job_result)

    return jobs

130 

131 

132def extract_alert_name(alert_string: str) -> str | None: 

133 """Extract alert title from LinkedIn job alert email strings. 

134 :param str alert_string: alert string from email 

135 :return: extracted job title or None if not found""" 

136 

137 # Pattern: Extract text between quotes 

138 pattern = r"“([^”]+)”" 

139 match = re.search(pattern, alert_string) 

140 if match: 

141 return match.group(1).strip() 

142 

143 return None