Coverage for backend / app / job_email_scraping / email_parsers / indeed.py: 86%
92 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-17 21:34 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-17 21:34 +0000
1"""Indeed job alert email parser."""
3import re
5import cloudscraper
6from bs4 import BeautifulSoup
8from app.job_email_scraping.email_parsers.utils import process_salary, Platform
9from app.job_email_scraping.schemas import Salary, JobInfo, JobResult
# Canonical Indeed job-view URL prefix; the parser appends the extracted job ID
# to this prefix to build the processed URL for each job.
BASE_URL = "https://www.indeed.com/viewjob?jk="
def get_indeed_redirected_url(job_url: str, max_attempts: int = 100) -> str:
    """Get the redirected URL from the Indeed job URL.

    The URL is requested (following redirects) until the final response URL
    contains ``indeed.com/viewjob?jk``. A URL that already contains that marker
    is returned unchanged without any network request.

    :param job_url: Indeed job URL
    :param max_attempts: maximum number of requests made before giving up
    :return: redirected URL
    :raises AssertionError: if no attempt ends on an Indeed ``viewjob`` URL
    """
    marker = "indeed.com/viewjob?jk"
    if marker in job_url:
        return job_url

    # A bounded ``for`` loop makes exactly ``max_attempts`` requests and returns
    # as soon as one succeeds, so a successful final attempt is never discarded.
    for _ in range(max_attempts):
        # A fresh scraper is created per attempt (as before); close it once the
        # request is done so that sessions do not pile up across retries.
        scraper = cloudscraper.create_scraper()
        try:
            url = scraper.get(job_url, allow_redirects=True).url
        finally:
            scraper.close()
        if marker in url:
            return url

    raise AssertionError(f"Too many attempts to get redirect for url {job_url}")
# Compiled once at import time instead of on every job section.
_JOB_ID_PATTERN = re.compile(r"[?&]jk=([a-zA-Z0-9]+)", re.IGNORECASE)
# Salaries like "£39,906 - £42,254 a year"
_SALARY_RANGE_PATTERN = re.compile(r"([£$€])([\d,]+)\s*-\s*([£$€])([\d,]+)\s*a\s*(\w+)")
# Single salary like "£40,000 a year"
_SALARY_SINGLE_PATTERN = re.compile(r"([£$€])([\d,]+)\s*a\s*(\w+)")


def _collapse_whitespace(text: str) -> str:
    """Collapse every run of whitespace in ``text`` into a single space."""
    return " ".join(text.split())


def _extract_job_id(url: str | None) -> str | None:
    """Return the ``jk`` job ID for ``url``, resolving redirects when needed."""
    if not url:
        return None
    match = _JOB_ID_PATTERN.search(url)
    if match is None:
        # The link carries no job ID itself, so follow it to the viewjob URL.
        match = _JOB_ID_PATTERN.search(get_indeed_redirected_url(url))
    return match.group(1) if match else None


def _extract_company(section) -> str | None:
    """Return the company name found in a job section, or None."""
    company_table = section.find("table", {"role": "presentation"})
    if not company_table:
        return None
    company_tds = company_table.find_all("td", style=lambda x: x and "padding:0 12px 0 0" in x)
    if not company_tds:
        return None
    return _collapse_whitespace(company_tds[0].get_text(strip=True))


def _extract_location(section) -> str | None:
    """Return the location found in a job section, or None."""
    location = None
    for td in section.find_all("td", {"align": "left", "valign": "top"}):
        if "color:#2d2d2d;font-size:14px;line-height:21px" not in td.get("style", ""):
            continue
        text = td.get_text(strip=True)
        # Short cells without a nested table are location candidates; longer
        # ones hold description text. The last candidate in the section wins.
        if len(text) < 50 and not td.find("table"):
            location = text
    return location


def _extract_salary(section) -> tuple:
    """Return ``(currency, min_amount, max_amount)`` for a job section.

    Only yearly salaries are reported; anything else yields three ``None``s.
    """
    nothing = (None, None, None)
    salary_table = section.find("table", {"bgcolor": "#f3f2f1"})
    if not salary_table:
        return nothing
    salary_td = salary_table.find("td", style=lambda x: x and "padding:3px 8px 3px 8px" in x)
    if not salary_td:
        return nothing
    salary_text = salary_td.get_text(strip=True)

    range_match = _SALARY_RANGE_PATTERN.search(salary_text)
    if range_match:
        if range_match.group(5) != "year":
            return nothing
        return (
            range_match.group(1),
            process_salary(range_match.group(2)),
            process_salary(range_match.group(4)),
        )

    # The single-salary pattern also matches the upper bound of a range, so it
    # is only tried when the range pattern did not match. The captured
    # frequency is the bare word ("year"), without the leading "a".
    single_match = _SALARY_SINGLE_PATTERN.search(salary_text)
    if single_match and single_match.group(3) == "year":
        amount = process_salary(single_match.group(2))
        return single_match.group(1), amount, amount
    return nothing


def parse_indeed_job_email(body: str) -> list[JobResult]:
    """Parse Indeed job alert email and extract job information.

    :param str body: HTML content of the email
    :return: list of JobResult objects, one per job section in the email
    :raises AssertionError: if no job ID can be determined for a job section
    """
    # BeautifulSoup ignores ``from_encoding`` (with a warning) for ``str``
    # markup, so the encoding hint is only passed for raw bytes.
    encoding_kwargs = {"from_encoding": "utf-8"} if isinstance(body, bytes) else {}
    soup = BeautifulSoup(body, "html.parser", **encoding_kwargs)

    jobs = []
    # Each job is in a td with class 'pb-24'
    for section in soup.find_all("td", {"class": "pb-24"}):
        title = None
        url = None

        # Job title and URL come from the link inside the section heading
        title_h2 = section.find("h2")
        title_link = title_h2.find("a", class_="strong-text-link") if title_h2 else None
        if title_link:
            title = _collapse_whitespace(title_link.get_text(strip=True))
            url = title_link.get("href")

        job_id = _extract_job_id(url)
        if job_id is None:
            raise AssertionError(f"Job id not found for url {url}")

        company = _extract_company(section)
        location = _extract_location(section)
        salary_currency, salary_min, salary_max = _extract_salary(section)

        processed_url = BASE_URL + job_id
        salary = Salary(min_amount=salary_min, max_amount=salary_max, currency=salary_currency)
        job_info = JobInfo(title=title, url=processed_url, salary=salary, raw_url=url)
        job_result = JobResult(
            company=company, job_id=job_id, location=location, job=job_info, platform=Platform.INDEED
        )
        jobs.append(job_result)

    return jobs
133def extract_alert_name(title: str) -> str | None:
134 """Parse Indeed email alert name from email title.
135 :param str title: email title
136 :return: email alert name or None if not found"""
138 # Pattern 1: "X more [job title] job" or "X new [job title] job"
139 pattern1 = r"(?:\d+\s+(?:more|new)\s+)([\w\s&/\-]+?)\s+jobs?"
141 # Pattern 2: "+ X new [job title] jobs" (after "hiring for")
142 pattern2 = r"\+\s+\d+\s+new\s+([\w\s&/\-]+?)\s+jobs?"
144 # Try pattern 2 first (more specific)
145 match = re.search(pattern2, title, re.IGNORECASE)
146 if match:
147 return match.group(1).strip()
149 # Try pattern 1
150 match = re.search(pattern1, title, re.IGNORECASE)
151 if match:
152 return match.group(1).strip()
154 return None