Coverage for backend/app/job_email_scraping/email_parsers/nhs.py: 96% (82 statements) — coverage.py v7.13.5, created at 2026-03-17 21:34 +0000
1"""NHS Jobs email parser"""
3import datetime as dt
4import html
5import re
7from bs4 import BeautifulSoup
9from app.job_email_scraping.email_parsers.utils import process_salary, Platform
10from app.job_email_scraping.schemas import Salary, JobInfo, JobResult
12BASE_URL = "https://beta.jobs.nhs.uk/candidate/jobadvert/"
def parse_nhs_job_email(body: str) -> list[JobResult]:
    """Parse an NHS Jobs alert email and extract job information.

    :param str body: email body (HTML)
    :return: list of JobResult objects, one per job advert found in the email
    """
    # NOTE(review): from_encoding is only honoured for bytes input; bs4 ignores
    # it (with a warning) when body is already str — confirm callers' input type.
    soup = BeautifulSoup(body, "html.parser", from_encoding="utf-8")

    # The main content area is a <td> carrying an inline max-width style.
    content_td = soup.find("td", style=lambda value: value and "max-width:560px" in value)
    if not content_td:
        return []

    jobs = []

    # Each job title is an anchor pointing at an NHS job advert URL.
    job_links = content_td.find_all("a", href=re.compile(r"beta\.jobs\.nhs\.uk/candidate/jobadvert/"))

    for job_link in job_links:
        # Initialise per-job fields; any of these may legitimately stay None.
        job_id = None
        location = None
        salary_currency = None
        salary_min = None
        salary_max = None
        deadline = None

        # The link text may contain embedded whitespace/newlines; normalise it.
        title = " ".join(job_link.get_text(strip=True).split())
        url = job_link.get("href", None)

        # NHS Jobs URLs are in format: /candidate/jobadvert/ALPHANUMERIC-ID
        if url:
            id_match = re.search(r"/candidate/jobadvert/([A-Za-z0-9\-]+)", url)
            if id_match:
                job_id = id_match.group(1)

        # Job details (closing date, location, pay) live as <li> items inside
        # the next presentation table after the title link.
        next_table = job_link.find_next("table", role="presentation")
        if next_table:
            for item in next_table.find_all("li"):
                text = item.get_text(strip=True)

                if text.startswith("Closing Date:"):
                    # (A previously duplicated startswith check was removed here.)
                    date_str = text.replace("Closing Date:", "").strip()
                    try:
                        deadline = dt.datetime.strptime(date_str, "%d %b %Y")
                    except ValueError:
                        # Unexpected date format — leave deadline unset.
                        pass
                elif text.startswith("Location:"):
                    location = text.replace("Location:", "").strip()
                elif text.startswith("Pay:"):
                    salary_text = text.replace("Pay:", "").strip()
                    salary_pattern = r"([£$€])([\d,]+)\s+to\s+([£$€])([\d,]+)\s+a\s+(year|month|week|day|hour)"
                    salary_match = re.search(salary_pattern, salary_text)
                    # Only annual salaries are recorded; other pay frequencies
                    # (month/week/day/hour) are deliberately ignored.
                    if salary_match and salary_match.group(5) == "year":
                        salary_currency = salary_match.group(1)
                        salary_min = process_salary(salary_match.group(2))
                        salary_max = process_salary(salary_match.group(4))

        # Rebuild a canonical advert URL from the extracted ID; fall back to
        # the raw href when no ID could be extracted (previously this raised
        # TypeError on BASE_URL + None).
        processed_url = BASE_URL + job_id if job_id else url

        # Create Pydantic objects
        salary = Salary(min_amount=salary_min, max_amount=salary_max, currency=salary_currency)
        job_info = JobInfo(title=title, url=processed_url, raw_url=url, salary=salary, deadline=deadline)
        job_result = JobResult(company="NHS", job_id=job_id, location=location, job=job_info, platform=Platform.NHS)
        jobs.append(job_result)

    return jobs
def extract_alert_name(body: str) -> str | None:
    """Extract keywords and location from the job alert settings HTML.

    :param body: job alert email body (HTML)
    :return: keywords and location joined by a space, or None if neither found
    """
    soup = BeautifulSoup(body, "html.parser", from_encoding="utf-8")

    # Locate the "Your job alert settings" heading text node.
    heading = soup.find(string=re.compile(r"Your job alert settings", re.IGNORECASE))
    if not heading:
        return None

    # The settings content lives inside the heading's enclosing <td>.
    settings_cell = heading.find_parent("td")
    if not settings_cell:
        return None

    # Work on the entity-unescaped HTML string so the regexes see plain text.
    markup = html.unescape(str(settings_cell))

    def _capture(label: str) -> str | None:
        # Grab the text following "<label>:" up to the next HTML tag.
        found = re.search(label + r":\s*([^<]+)", markup, re.IGNORECASE)
        return found.group(1).strip() if found else None

    # Concatenate whichever of the two parts are present, space-separated.
    parts = [value for value in (_capture("Your keywords"), _capture("Your location")) if value]
    return " ".join(parts) if parts else None