Coverage for backend/app/eis/location_parser.py: 100%

90 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-09-22 15:38 +0000

1"""Location Parser Module 

2 

3Extracts and parses location components from job posting location strings. 

4Handles postcodes, cities, countries, and attendance type indicators across 

5multiple international formats including UK, US, and Canadian postal codes.""" 

6 

7import re 

8 

9from app.schemas import LocationCreate 

10 

11 

class LocationParser:
    """Parser for extracting location components from job location strings.

    A raw location such as ``"London, UK - Remote"`` is split into a country,
    a city, a postcode and an attendance type ("remote", "hybrid", "on-site").
    """

    # Country aliases that are reported under a single standardised name.
    _UK_ALIASES = frozenset(
        {
            "uk",
            "united kingdom",
            "britain",
            "great britain",
            "england",
            "scotland",
            "wales",
            "northern ireland",
        }
    )
    _US_ALIASES = frozenset({"usa", "united states", "united states of america", "america", "us"})

    # Prepositions, articles and conjunctions that must not end up in a city name.
    _FILLER_WORDS = (
        "from",
        "in",
        "at",
        "to",
        "for",
        "with",
        "by",
        "of",
        "the",
        "a",
        "an",
        "and",
        "or",
        "but",
    )

    def __init__(self) -> None:
        """Set up the country list, postcode patterns and attendance indicators."""

        # Common country names and variations (all lower case)
        self.countries = {
            "uk",
            "united kingdom",
            "britain",
            "great britain",
            "england",
            "scotland",
            "wales",
            "northern ireland",
            "usa",
            "united states",
            "united states of america",
            "america",
            "us",
            "canada",
            "australia",
            "germany",
            "france",
            "italy",
            "spain",
            "netherlands",
            "belgium",
            "ireland",
            "switzerland",
            "austria",
            "sweden",
            "norway",
            "denmark",
            "finland",
            "poland",
            "czech republic",
            "hungary",
            "portugal",
            "greece",
            "turkey",
            "india",
            "china",
            "japan",
            "singapore",
            "hong kong",
            "south korea",
            "brazil",
            "mexico",
            "argentina",
            "chile",
            "colombia",
        }

        # UK postcode pattern
        self.uk_postcode_pattern = r"\b[A-Z]{1,2}[0-9][A-Z0-9]?\s?[0-9][A-Z]{2}\b"

        # US zip code pattern
        self.us_zipcode_pattern = r"\b\d{5}(?:-\d{4})?\b"

        # Canadian postal code pattern
        self.ca_postcode_pattern = r"\b[A-Z]\d[A-Z]\s?\d[A-Z]\d\b"

        # General postcode patterns for other countries
        self.general_postcode_patterns = [
            r"\b\d{4,6}\b",  # 4-6 digit postcodes
            r"\b[A-Z]{2}-?\d{3,5}\b",  # Letter-number combinations
        ]

        # Attendance type indicators
        self.attendance_indicators = {
            "remote": ["remote", "work from home", "wfh", "anywhere", "global", "fully remote"],
            "hybrid": ["hybrid", "flexible"],
            "on-site": ["on-site", "office", "in-person", "on site", "onsite"],
        }

    @staticmethod
    def _indicator_pattern(indicators: list[str]) -> re.Pattern[str]:
        """Build a case-insensitive, word-bounded regex matching any of the indicators."""

        # Longest first, so "fully remote" is consumed as a whole instead of
        # matching "remote" and leaving a stray "fully" behind.
        ordered = sorted(set(indicators), key=lambda text: (-len(text), text))
        alternation = "|".join(re.escape(text) for text in ordered)
        return re.compile(r"\b(?:" + alternation + r")\b", re.IGNORECASE)

    def _mentions_any(self, text: str, indicators: list[str]) -> bool:
        """Return True if the text contains any indicator as a whole word or phrase."""

        # An empty alternation would match at every word boundary, so guard it.
        if not indicators:
            return False
        return self._indicator_pattern(indicators).search(text) is not None

    def _strip_attendance_indicators(self, text: str) -> str:
        """Remove every attendance indicator (of any type) from the text."""

        every_indicator = [
            indicator for group in self.attendance_indicators.values() for indicator in group
        ]
        if not every_indicator:
            return text
        return self._indicator_pattern(every_indicator).sub("", text)

    @staticmethod
    def _title_case(text: str) -> str:
        """Title-case the text without capitalising a possessive "'s"."""

        # str.title() treats an apostrophe as a word break ("King'S Lynn").
        return re.sub(r"(?<=[^\W\d_])(['’])S\b", r"\1s", text.title())

    def extract_postcode(self, location_str: str) -> str | None:
        """Extract postcode from the location string

        Patterns are tried from most to least specific: UK, US, Canadian, then
        the general fallbacks. The first pattern that matches wins.

        :param location_str: Raw location string from the job posting
        :return: Postcode string if found, else None"""

        # The letter-based patterns are written in upper case, so they are run
        # against an upper-cased copy; the returned postcode is upper case too.
        location_upper = location_str.upper()

        candidates = [
            (self.uk_postcode_pattern, location_upper),
            (self.us_zipcode_pattern, location_str),
            (self.ca_postcode_pattern, location_upper),
        ]
        candidates.extend((pattern, location_upper) for pattern in self.general_postcode_patterns)

        for pattern, haystack in candidates:
            found = re.search(pattern, haystack)
            if found:
                return found.group().strip()

        return None

    def extract_attendance_type(self, location_str: str) -> str | None:
        """Extract attendance type from the location string

        Indicators are matched as whole words or phrases, the same way
        parse_location removes them, so "London Site" is not read as "on site".

        :param location_str: Raw location string from the job posting
        :return: Attendance type ("remote", "hybrid", "on-site") if found, else None"""

        has_remote = self._mentions_any(location_str, self.attendance_indicators["remote"])
        has_office = self._mentions_any(location_str, self.attendance_indicators["on-site"])

        # Both remote and office/on-site indicators present -> hybrid
        if has_remote and has_office:
            return "hybrid"

        # Explicit hybrid indicators take precedence over a single remote/on-site hit
        if self._mentions_any(location_str, self.attendance_indicators["hybrid"]):
            return "hybrid"

        if has_remote:
            return "remote"

        if has_office:
            return "on-site"

        return None

    def extract_country_with_match(self, location_str: str) -> tuple[str | None, str | None]:
        """Extract country from the location string and return both the standardised name and the matched text

        When several known countries appear, the longest name wins. Ties are
        broken by earliest position in the string and then by name, so the
        result does not depend on set iteration order.

        :param location_str: Raw location string from the job posting
        :return: Standardised country name and the matched (lower-case) variant, or (None, None) if not found"""

        location_lower = location_str.lower().strip()

        best_rank = None
        best_country = None
        for country in self.countries:
            # Word boundaries stop "us" from matching inside e.g. "austria"
            found = re.search(r"\b" + re.escape(country) + r"\b", location_lower)
            if found is None:
                continue
            rank = (-len(country), found.start(), country)
            if best_rank is None or rank < best_rank:
                best_rank = rank
                best_country = country

        if best_country is None:
            return None, None

        if best_country in self._UK_ALIASES:
            return "United Kingdom", best_country
        if best_country in self._US_ALIASES:
            return "United States", best_country
        return best_country.title(), best_country

    def parse_location(self, location_str: str) -> tuple[LocationCreate, str | None]:
        """Parse a location string and extract country, city, postcode, and attendance type

        The attendance type is detected first and its indicators are removed.
        The postcode and the country are then extracted and removed in turn.
        The first comma-separated part of whatever remains becomes the city.

        :param location_str: Raw location string from the job posting
        :return: Tuple of (LocationCreate object, attendance_type string or None)"""

        location_str = location_str.strip()

        if not location_str:
            return LocationCreate(), None

        # Extract attendance type first
        attendance_type = self.extract_attendance_type(location_str)

        # Work on a copy so the indicators can be removed before location parsing
        working_str = location_str

        if attendance_type:
            working_str = self._strip_attendance_indicators(working_str)

            # Collapse separators left behind by the removed indicators
            working_str = re.sub(r"\s*[-,;|]\s*", " ", working_str).strip()
            working_str = re.sub(r"\s+", " ", working_str)  # Normalize whitespace

            # Nothing but punctuation left: we only have attendance type info
            if not working_str or re.match(r"^\W*$", working_str):
                return LocationCreate(), attendance_type

        # Extract postcode first (as it's most specific)
        postcode = self.extract_postcode(working_str)
        if postcode:
            working_str = re.sub(re.escape(postcode), "", working_str, flags=re.IGNORECASE).strip()

        # Extract country and remove the text that matched it
        country, original = self.extract_country_with_match(working_str)
        if country and original:
            country_pattern = r"\b" + re.escape(original) + r"\b"
            working_str = re.sub(country_pattern, "", working_str, flags=re.IGNORECASE).strip()

        # Normalise separators to commas.
        # NOTE(review): hyphens count as separators here, so a hyphenated place
        # name is split and only its first part becomes the city.
        working_str = re.sub(r"[,;|\-]+", ",", working_str).strip(" ,")

        # Remove common prepositions and articles that shouldn't be city names
        filler_pattern = r"\b(?:" + "|".join(re.escape(word) for word in self._FILLER_WORDS) + r")\b"
        working_str = re.sub(filler_pattern, "", working_str, flags=re.IGNORECASE)

        # Clean up whitespace and separators again after removing the filler words
        working_str = re.sub(r"\s*[-,;|]\s*", ",", working_str).strip(" ,")
        working_str = re.sub(r"\s+", " ", working_str).strip()

        # The first non-empty comma-separated part is taken as the city
        parts = [part.strip() for part in working_str.split(",") if part.strip()]
        city = self._title_case(parts[0]) if parts else None

        location = LocationCreate(country=country, city=city, postcode=postcode)
        return location, attendance_type

    def parse_location_only(self, location_str: str) -> LocationCreate:
        """Parse a location string and return only the location data (for backward compatibility)

        :param location_str: Raw location string from the job posting
        :return: Location schema object with parsed components"""

        location, _ = self.parse_location(location_str)
        return location