Coverage for backend / app / job_email_scraping / email_parsers / nhs.py: 96%

82 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-17 21:34 +0000

1"""NHS Jobs email parser""" 

2 

3import datetime as dt 

4import html 

5import re 

6 

7from bs4 import BeautifulSoup 

8 

9from app.job_email_scraping.email_parsers.utils import process_salary, Platform 

10from app.job_email_scraping.schemas import Salary, JobInfo, JobResult 

11 

12BASE_URL = "https://beta.jobs.nhs.uk/candidate/jobadvert/" 

13 

14 

def parse_nhs_job_email(body: str) -> list[JobResult]:
    """Parse an NHS Jobs alert email and extract job information.

    :param str body: raw HTML email body
    :return: list of JobResult objects, one per job advert found"""

    soup = BeautifulSoup(body, "html.parser", from_encoding="utf-8")

    # The job listings live in the main content cell, identified by its
    # inline max-width style.
    content_td = soup.find("td", style=lambda value: value and "max-width:560px" in value)

    if not content_td:
        return []

    jobs = []

    # Each job starts with an anchor pointing at the advert page.
    job_links = content_td.find_all("a", href=re.compile(r"beta\.jobs\.nhs\.uk/candidate/jobadvert/"))

    for job_link in job_links:
        # Initialise per-job fields
        location = None
        salary_currency = None
        salary_min = None
        salary_max = None
        deadline = None

        # Title text can be split across lines; collapse internal whitespace.
        title = " ".join(job_link.get_text(strip=True).split())
        url = job_link.get("href", None)

        # NHS Jobs URLs are in format: /candidate/jobadvert/ALPHANUMERIC-ID
        job_id = None
        if url:
            match = re.search(r"/candidate/jobadvert/([A-Za-z0-9\-]+)", url)
            if match:
                job_id = match.group(1)

        if not job_id:
            # Without an ID we cannot build the canonical advert URL
            # (BASE_URL + None would raise TypeError), so skip this link.
            continue

        # The job details (closing date, location, pay) follow the link
        # in the next presentation table, one <li> per field.
        next_table = job_link.find_next("table", role="presentation")

        if next_table:
            for item in next_table.find_all("li"):
                text = item.get_text(strip=True)

                # Extract closing date
                if text.startswith("Closing Date:"):
                    date_str = text.removeprefix("Closing Date:").strip()
                    try:
                        deadline = dt.datetime.strptime(date_str, "%d %b %Y")
                    except ValueError:
                        # Unexpected date format: leave deadline unset.
                        pass

                # Extract location
                elif text.startswith("Location:"):
                    location = text.removeprefix("Location:").strip()

                # Extract salary (Pay)
                elif text.startswith("Pay:"):
                    salary_text = text.removeprefix("Pay:").strip()
                    salary_pattern = r"([£$€])([\d,]+)\s+to\s+([£$€])([\d,]+)\s+a\s+(year|month|week|day|hour)"
                    match = re.search(salary_pattern, salary_text)
                    # Only annual salaries are recorded; other frequencies
                    # (month/week/day/hour) are deliberately ignored.
                    if match and match.group(5) == "year":
                        salary_currency = match.group(1)
                        salary_min = process_salary(match.group(2))
                        salary_max = process_salary(match.group(4))

        processed_url = BASE_URL + job_id
        # Create Pydantic objects
        salary = Salary(min_amount=salary_min, max_amount=salary_max, currency=salary_currency)
        job_info = JobInfo(title=title, url=processed_url, raw_url=url, salary=salary, deadline=deadline)
        job_result = JobResult(company="NHS", job_id=job_id, location=location, job=job_info, platform=Platform.NHS)
        jobs.append(job_result)

    return jobs

103 

104 

def extract_alert_name(body: str) -> str | None:
    """Extract keywords and location from job alert settings HTML and concatenate them.

    :param body: job alert body
    :return: concatenated keywords and location or None if not found"""

    soup = BeautifulSoup(body, "html.parser", from_encoding="utf-8")

    # Locate the settings section by its heading text.
    heading = soup.find(string=re.compile(r"Your job alert settings", re.IGNORECASE))
    if not heading:
        return None

    # The settings text sits inside the heading's enclosing table cell.
    container = heading.find_parent("td")
    if not container:
        return None

    # Work on the unescaped markup so entity-encoded text matches too.
    markup = html.unescape(str(container))

    def _field(label: str) -> str | None:
        # Pull the text that follows "<label>:" up to the next HTML tag.
        found = re.search(rf"{label}:\s*([^<]+)", markup, re.IGNORECASE)
        return found.group(1).strip() if found else None

    # Join whichever of the two fields are present, keywords first.
    pieces = [p for p in (_field("Your keywords"), _field("Your location")) if p]
    return " ".join(pieces) if pieces else None