Coverage for backend / app / job_email_scraping / email_parsers / nhs.py: 96%

82 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-17 21:34 +0000

1"""NHS Jobs email parser""" 

2 

3import datetime as dt 

4import html 

5import re 

6 

7from bs4 import BeautifulSoup 

8 

9from app.job_email_scraping.email_parsers.utils import process_salary, Platform 

10from app.job_email_scraping.schemas import Salary, JobInfo, JobResult 

11 

12BASE_URL = "https://beta.jobs.nhs.uk/candidate/jobadvert/" 

13 

14 

def parse_nhs_job_email(body: str) -> list[JobResult]:
    """Parse an NHS Jobs alert email and extract job information.

    :param str body: raw HTML email body
    :return: list of JobResult objects, one per job advert found"""

    soup = BeautifulSoup(body, "html.parser", from_encoding="utf-8")

    # The job listings live in the main content cell, identified by its
    # inline max-width style.
    content_td = soup.find("td", style=lambda value: value and "max-width:560px" in value)

    if not content_td:
        return []

    jobs = []

    # Each job starts with an anchor pointing at the advert page.
    job_links = content_td.find_all("a", href=re.compile(r"beta\.jobs\.nhs\.uk/candidate/jobadvert/"))

    for job_link in job_links:
        # Initialise per-job fields
        location = None
        salary_currency = None
        salary_min = None
        salary_max = None
        deadline = None

        # Title text can be split across lines; collapse internal whitespace.
        title = " ".join(job_link.get_text(strip=True).split())
        url = job_link.get("href", None)

        # NHS Jobs URLs are in format: /candidate/jobadvert/ALPHANUMERIC-ID
        job_id = None
        if url:
            match = re.search(r"/candidate/jobadvert/([A-Za-z0-9\-]+)", url)
            if match:
                job_id = match.group(1)

        if not job_id:
            # Without an ID we cannot build the canonical advert URL
            # (BASE_URL + None would raise TypeError), so skip this link.
            continue

        # The job details (closing date, location, pay) follow the link
        # in the next presentation table, one <li> per field.
        next_table = job_link.find_next("table", role="presentation")

        if next_table:
            for item in next_table.find_all("li"):
                text = item.get_text(strip=True)

                # Extract closing date
                if text.startswith("Closing Date:"):
                    date_str = text.removeprefix("Closing Date:").strip()
                    try:
                        deadline = dt.datetime.strptime(date_str, "%d %b %Y")
                    except ValueError:
                        # Unexpected date format: leave deadline unset.
                        pass

                # Extract location
                elif text.startswith("Location:"):
                    location = text.removeprefix("Location:").strip()

                # Extract salary (Pay)
                elif text.startswith("Pay:"):
                    salary_text = text.removeprefix("Pay:").strip()
                    salary_pattern = r"([£$€])([\d,]+)\s+to\s+([£$€])([\d,]+)\s+a\s+(year|month|week|day|hour)"
                    match = re.search(salary_pattern, salary_text)
                    # Only annual salaries are recorded; other frequencies
                    # (month/week/day/hour) are deliberately ignored.
                    if match and match.group(5) == "year":
                        salary_currency = match.group(1)
                        salary_min = process_salary(match.group(2))
                        salary_max = process_salary(match.group(4))

        processed_url = BASE_URL + job_id
        # Create Pydantic objects
        salary = Salary(min_amount=salary_min, max_amount=salary_max, currency=salary_currency)
        job_info = JobInfo(title=title, url=processed_url, raw_url=url, salary=salary, deadline=deadline)
        job_result = JobResult(company="NHS", job_id=job_id, location=location, job=job_info, platform=Platform.NHS)
        jobs.append(job_result)

    return jobs

103 

104 

def extract_alert_name(body: str) -> str | None:
    """Extract keywords and location from job alert settings HTML and concatenate them.

    :param body: job alert body
    :return: concatenated keywords and location or None if not found"""

    soup = BeautifulSoup(body, "html.parser", from_encoding="utf-8")

    # Locate the settings section by its heading text.
    heading = soup.find(string=re.compile(r"Your job alert settings", re.IGNORECASE))
    if not heading:
        return None

    # The settings text sits inside the heading's enclosing table cell.
    container = heading.find_parent("td")
    if not container:
        return None

    # Work on the unescaped markup so entity-encoded text matches too.
    markup = html.unescape(str(container))

    def _field(label: str) -> str | None:
        # Pull the text that follows "<label>:" up to the next HTML tag.
        found = re.search(rf"{label}:\s*([^<]+)", markup, re.IGNORECASE)
        return found.group(1).strip() if found else None

    # Join whichever of the two fields are present, keywords first.
    pieces = [p for p in (_field("Your keywords"), _field("Your location")) if p]
    return " ".join(pieces) if pieces else None