Coverage for backend/app/job_email_scraping/email_parsers/indeed.py

1"""Indeed job alert email parser."""

3import re

5import cloudscraper

6from bs4 import BeautifulSoup

8from app.job_email_scraping.email_parsers.utils import process_salary, Platform

9from app.job_email_scraping.schemas import Salary, JobInfo, JobResult

# Prefix of the canonical Indeed job page URL; the parsed job id (the ``jk``
# query value) is appended to it to build the normalised job link.
BASE_URL = "https://www.indeed.com/viewjob?jk="

def get_indeed_redirected_url(job_url: str, max_attempts: int = 100) -> str:
    """Get the redirected URL from the Indeed job URL

    The URL is requested (following redirects) until the final address
    contains ``indeed.com/viewjob?jk``. A URL that already contains it is
    returned unchanged without any request being made.

    :param job_url: Indeed job URL
    :param max_attempts: max number of attempts to get redirected
    :return: redirected URL
    :raises AssertionError: if the canonical URL is not reached within ``max_attempts`` requests"""

    canonical_marker = "indeed.com/viewjob?jk"
    if canonical_marker in job_url:
        return job_url

    # Bounded loop: exactly ``max_attempts`` requests are made at most (the
    # previous counter check allowed one extra request).
    for _ in range(max_attempts):
        # NOTE(review): a new scraper is built for every attempt, as before;
        # presumably so each retry starts from a fresh session - confirm
        # before hoisting it out of the loop.
        scraper = cloudscraper.create_scraper()
        response = scraper.get(job_url, allow_redirects=True)
        if canonical_marker in response.url:
            return response.url

    raise AssertionError(f"Too many attempts to get redirect for url {job_url}")

def _clean_text(node) -> str:
    """Return the text of a BeautifulSoup node with whitespace runs collapsed to single spaces."""
    return " ".join(node.get_text(strip=True).split())


def _extract_job_id(url: str) -> str | None:
    """Return the ``jk`` job id found in ``url``, following the redirect first when the id is absent."""
    jobid_pattern = r"[?&]jk=([a-zA-Z0-9]+)"
    match = re.search(jobid_pattern, url, re.IGNORECASE)
    if match is None:
        # The link does not carry the job id itself: resolve it to the
        # canonical job URL and look for the id there.
        match = re.search(jobid_pattern, get_indeed_redirected_url(url), re.IGNORECASE)
    return match.group(1) if match else None


def _extract_company(section) -> str | None:
    """Return the company name from a job section, or None if the expected cell is missing."""
    company_table = section.find("table", {"role": "presentation"})
    if company_table is None:
        return None
    company_tds = company_table.find_all("td", style=lambda x: x and "padding:0 12px 0 0" in x)
    return _clean_text(company_tds[0]) if company_tds else None


def _extract_location(section) -> str | None:
    """Return the location text from a job section, or None if no candidate cell is found."""
    location = None
    for td in section.find_all("td", {"align": "left", "valign": "top"}):
        if "color:#2d2d2d;font-size:14px;line-height:21px" not in td.get("style", ""):
            continue
        text = td.get_text(strip=True)
        # Cells holding a nested table or long text are not treated as the
        # location (presumably the company/rating cell and the description).
        # When several cells qualify, the last one wins.
        if len(text) < 50 and not td.find("table"):
            location = text
    return location


def _extract_salary(section) -> tuple:
    """Return ``(currency, min_amount, max_amount)`` for a yearly salary, or three Nones otherwise."""
    no_salary = (None, None, None)
    salary_table = section.find("table", {"bgcolor": "#f3f2f1"})
    if salary_table is None:
        return no_salary
    salary_td = salary_table.find("td", style=lambda x: x and "padding:3px 8px 3px 8px" in x)
    if salary_td is None:
        return no_salary
    salary_text = salary_td.get_text(strip=True)

    # Pattern for salaries like £39,906 - £42,254 a year
    range_match = re.search(r"([£$€])([\d,]+)\s*-\s*([£$€])([\d,]+)\s*a\s*(\w+)", salary_text)
    if range_match:
        if range_match.group(5) != "year":
            return no_salary
        return (
            range_match.group(1),
            process_salary(range_match.group(2)),
            process_salary(range_match.group(4)),
        )

    # Pattern for single salary like £40,000 a year. Only tried when no range
    # was found: on a range it would match the upper bound on its own and
    # overwrite the minimum.
    single_match = re.search(r"([£$€])([\d,]+)\s*a\s*(\w+)", salary_text)
    # The captured frequency is the word after "a" (e.g. "year"), not "a year".
    if single_match and single_match.group(3) == "year":
        amount = process_salary(single_match.group(2))
        return single_match.group(1), amount, amount
    return no_salary


def parse_indeed_job_email(body: str) -> list[JobResult]:
    """Parse Indeed job alert email and extract job information.

    Each ``td`` with class ``pb-24`` is treated as one job. Only yearly
    salaries are extracted; other frequencies leave the salary fields empty.

    :param str body: HTML content of the email
    :return: list of JobResult objects, one per job section
    :raises AssertionError: if no job id can be determined for a job section"""

    # ``from_encoding`` is only meaningful for byte input; BeautifulSoup
    # ignores it (with a warning) when the markup is already a str.
    if isinstance(body, bytes):
        soup = BeautifulSoup(body, "html.parser", from_encoding="utf-8")
    else:
        soup = BeautifulSoup(body, "html.parser")

    jobs = []
    # Find all job sections (each job is in a td with class 'pb-24')
    for section in soup.find_all("td", {"class": "pb-24"}):
        title = None
        url = None
        job_id = None

        # Extract job title, URL and job id from the title link
        title_h2 = section.find("h2")
        title_link = title_h2.find("a", class_="strong-text-link") if title_h2 is not None else None
        if title_link is not None:
            title = _clean_text(title_link)
            url = title_link.get("href")
            # A link without href cannot yield an id; fall through to the
            # explicit error below instead of failing inside the regex call.
            if url:
                job_id = _extract_job_id(url)

        company = _extract_company(section)
        location = _extract_location(section)
        salary_currency, salary_min, salary_max = _extract_salary(section)

        if job_id is None:
            raise AssertionError(f"Job id not found for url {url}")
        processed_url = BASE_URL + job_id
        salary = Salary(min_amount=salary_min, max_amount=salary_max, currency=salary_currency)
        job_info = JobInfo(title=title, url=processed_url, salary=salary, raw_url=url)
        job_result = JobResult(
            company=company, job_id=job_id, location=location, job=job_info, platform=Platform.INDEED
        )
        jobs.append(job_result)

    return jobs

131

132

133def extract_alert_name(title: str) -> str | None:

134 """Parse Indeed email alert name from email title.

135 :param str title: email title

136 :return: email alert name or None if not found"""

137

138 # Pattern 1: "X more [job title] job" or "X new [job title] job"

139 pattern1 = r"(?:\d+\s+(?:more|new)\s+)([\w\s&/\-]+?)\s+jobs?"

140

141 # Pattern 2: "+ X new [job title] jobs" (after "hiring for")

142 pattern2 = r"\+\s+\d+\s+new\s+([\w\s&/\-]+?)\s+jobs?"

143

144 # Try pattern 2 first (more specific)

145 match = re.search(pattern2, title, re.IGNORECASE)

146 if match:

147 return match.group(1).strip()

148

149 # Try pattern 1

150 match = re.search(pattern1, title, re.IGNORECASE)

151 if match:

152 return match.group(1).strip()

153

154 return None

Coverage for backend / app / job_email_scraping / email_parsers / indeed.py: 86%

92 statements