Coverage for backend / app / job_email_scraping / email_parsers / indeed.py: 86%

92 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-17 21:34 +0000

1"""Indeed job alert email parser.""" 

2 

3import re 

4 

5import cloudscraper 

6from bs4 import BeautifulSoup 

7 

8from app.job_email_scraping.email_parsers.utils import process_salary, Platform 

9from app.job_email_scraping.schemas import Salary, JobInfo, JobResult 

10 

11BASE_URL = "https://www.indeed.com/viewjob?jk=" 

12 

13 

def get_indeed_redirected_url(job_url: str, max_attempts: int = 100) -> str:
    """Get the redirected URL from the Indeed job URL

    The URL is requested with redirects enabled until the final response URL is an
    Indeed ``viewjob`` link. A URL that already is such a link is returned as is,
    without issuing any request.

    :param job_url: Indeed job URL
    :param max_attempts: max number of attempts to get redirected
    :return: redirected URL
    :raises AssertionError: if no ``viewjob`` URL is reached within ``max_attempts`` requests"""

    viewjob_marker = "indeed.com/viewjob?jk"
    if viewjob_marker in job_url:
        return job_url

    # Bounded loop: at most ``max_attempts`` requests are issued, and every
    # response is inspected before deciding whether to give up.
    for _ in range(max_attempts):
        scraper = cloudscraper.create_scraper()
        response = scraper.get(job_url, allow_redirects=True)
        if viewjob_marker in response.url:
            return response.url

    raise AssertionError(f"Too many attempts to get redirect for url {job_url}")

30 

31 

# Query-string parameter carrying the Indeed job id (``?jk=...`` or ``&jk=...``).
_INDEED_JOB_ID_RE = re.compile(r"[?&]jk=([a-zA-Z0-9]+)", re.IGNORECASE)
# Salaries like "£39,906 - £42,254 a year".
_INDEED_SALARY_RANGE_RE = re.compile(r"([£$€])([\d,]+)\s*-\s*([£$€])([\d,]+)\s*a\s*(\w+)")
# Salaries like "£40,000 a year".
_INDEED_SALARY_SINGLE_RE = re.compile(r"([£$€])([\d,]+)\s*a\s*(\w+)")


def _extract_indeed_job_id(url: str | None) -> str | None:
    """Return the ``jk`` job id of an Indeed link, resolving redirects when the link lacks it.

    :param url: raw link taken from the email, or None when the link had no ``href``
    :return: job id, or None if it could not be determined"""

    if not url:
        return None
    match = _INDEED_JOB_ID_RE.search(url)
    if match is None:
        # Tracking links do not expose the id; follow them to the real job page.
        match = _INDEED_JOB_ID_RE.search(get_indeed_redirected_url(url))
    return match.group(1) if match else None


def _parse_indeed_salary(salary_text: str) -> tuple:
    """Return ``(currency, min_amount, max_amount)`` for a yearly salary text.

    Only salaries expressed per year are extracted; anything else yields three None values.

    :param salary_text: text of the salary cell
    :return: tuple of currency symbol, minimum amount and maximum amount"""

    range_match = _INDEED_SALARY_RANGE_RE.search(salary_text)
    if range_match:
        if range_match.group(5) != "year":
            return None, None, None
        return (
            range_match.group(1),
            process_salary(range_match.group(2)),
            process_salary(range_match.group(4)),
        )

    # The single-amount pattern also matches the upper bound of a range, so it is
    # only tried when the range pattern did not match.
    single_match = _INDEED_SALARY_SINGLE_RE.search(salary_text)
    if single_match and single_match.group(3) == "year":
        amount = process_salary(single_match.group(2))
        return single_match.group(1), amount, amount

    return None, None, None


def parse_indeed_job_email(body: str) -> list[JobResult]:
    """Parse Indeed job alert email and extract job information.

    :param str body: HTML content of the email
    :return: list of JobResult objects, one per job section found in the email
    :raises AssertionError: if the job id of a job section cannot be determined"""

    # ``from_encoding`` is only meaningful for bytes; BeautifulSoup ignores it
    # (with a warning) when the markup is already a str.
    encoding = "utf-8" if isinstance(body, bytes) else None
    soup = BeautifulSoup(body, "html.parser", from_encoding=encoding)

    jobs = []

    # Each job is in a td with class 'pb-24'
    for section in soup.find_all("td", {"class": "pb-24"}):
        title = None
        url = None
        company = None
        location = None
        salary_currency = None
        salary_min = None
        salary_max = None

        # Job title and URL
        title_h2 = section.find("h2")
        title_link = title_h2.find("a", class_="strong-text-link") if title_h2 else None
        if title_link:
            title = " ".join(title_link.get_text(strip=True).split())
            url = title_link.get("href")

        # Job ID (None when the section has no usable link)
        job_id = _extract_indeed_job_id(url)

        # Company name
        company_table = section.find("table", {"role": "presentation"})
        if company_table:
            company_tds = company_table.find_all("td", style=lambda x: x and "padding:0 12px 0 0" in x)
            if company_tds:
                company = " ".join(company_tds[0].get_text(strip=True).split())

        # Location: short text cells with the expected style and no nested table;
        # when several cells qualify, the last one wins.
        for td in section.find_all("td", {"align": "left", "valign": "top"}):
            style = td.get("style", "")
            if "color:#2d2d2d;font-size:14px;line-height:21px" in style:
                text = td.get_text(strip=True)
                if len(text) < 50 and not td.find("table"):
                    location = text

        # Salary
        salary_table = section.find("table", {"bgcolor": "#f3f2f1"})
        if salary_table:
            salary_td = salary_table.find("td", style=lambda x: x and "padding:3px 8px 3px 8px" in x)
            if salary_td:
                salary_currency, salary_min, salary_max = _parse_indeed_salary(
                    salary_td.get_text(strip=True)
                )

        if job_id is None:
            raise AssertionError(f"Job id not found for url {url}")
        processed_url = BASE_URL + job_id
        salary = Salary(min_amount=salary_min, max_amount=salary_max, currency=salary_currency)
        job_info = JobInfo(title=title, url=processed_url, salary=salary, raw_url=url)
        job_result = JobResult(
            company=company, job_id=job_id, location=location, job=job_info, platform=Platform.INDEED
        )
        jobs.append(job_result)

    return jobs

131 

132 

133def extract_alert_name(title: str) -> str | None: 

134 """Parse Indeed email alert name from email title. 

135 :param str title: email title 

136 :return: email alert name or None if not found""" 

137 

138 # Pattern 1: "X more [job title] job" or "X new [job title] job" 

139 pattern1 = r"(?:\d+\s+(?:more|new)\s+)([\w\s&/\-]+?)\s+jobs?" 

140 

141 # Pattern 2: "+ X new [job title] jobs" (after "hiring for") 

142 pattern2 = r"\+\s+\d+\s+new\s+([\w\s&/\-]+?)\s+jobs?" 

143 

144 # Try pattern 2 first (more specific) 

145 match = re.search(pattern2, title, re.IGNORECASE) 

146 if match: 

147 return match.group(1).strip() 

148 

149 # Try pattern 1 

150 match = re.search(pattern1, title, re.IGNORECASE) 

151 if match: 

152 return match.group(1).strip() 

153 

154 return None