Coverage for backend/app/eis/location_parser.py: 100%
90 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-09-22 15:38 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-09-22 15:38 +0000
1"""Location Parser Module
3Extracts and parses location components from job posting location strings.
4Handles postcodes, cities, countries, and attendance type indicators across
5multiple international formats including UK, US, and Canadian postal codes."""
7import re
9from app.schemas import LocationCreate
class LocationParser:
    """Parser for extracting location components from job location strings

    A raw string such as "Hybrid - Manchester, M1 1AE, UK" is split into an
    attendance type ("remote", "hybrid" or "on-site"), a postcode, a
    standardised country name and a city. Each extractor can also be called on
    its own; parse_location() combines them.
    """

    # Country variants (lower case) that standardise to "United Kingdom"
    _UK_VARIANTS = frozenset(
        {
            "uk",
            "united kingdom",
            "britain",
            "great britain",
            "england",
            "scotland",
            "wales",
            "northern ireland",
        }
    )

    # Country variants (lower case) that standardise to "United States"
    _US_VARIANTS = frozenset({"usa", "united states", "united states of america", "america", "us"})

    # Prepositions, articles and conjunctions that should never end up in a city name
    _FILLER_WORDS = re.compile(
        r"\b(?:from|in|at|to|for|with|by|of|the|a|an|and|or|but)\b",
        re.IGNORECASE,
    )

    # Field separators together with the whitespace around them. A hyphen only
    # counts as a separator when it is NOT enclosed by word characters on both
    # sides, so "London - UK" splits but "Stoke-on-Trent" stays in one piece.
    _SEPARATORS = re.compile(r"\s*(?:[,;|]+|(?<!\w)-+|-+(?!\w))\s*")

    # Bracket pairs with nothing (or only whitespace) inside, e.g. what is left
    # of "(Remote)" once the indicator itself has been removed
    _EMPTY_BRACKETS = re.compile(r"\(\s*\)|\[\s*\]|\{\s*\}")

    def __init__(self) -> None:
        """Set up the country names, postcode patterns and attendance indicators"""

        # Common country names and variations (all lower case)
        self.countries: set[str] = {
            "uk",
            "united kingdom",
            "britain",
            "great britain",
            "england",
            "scotland",
            "wales",
            "northern ireland",
            "usa",
            "united states",
            "united states of america",
            "america",
            "us",
            "canada",
            "australia",
            "germany",
            "france",
            "italy",
            "spain",
            "netherlands",
            "belgium",
            "ireland",
            "switzerland",
            "austria",
            "sweden",
            "norway",
            "denmark",
            "finland",
            "poland",
            "czech republic",
            "hungary",
            "portugal",
            "greece",
            "turkey",
            "india",
            "china",
            "japan",
            "singapore",
            "hong kong",
            "south korea",
            "brazil",
            "mexico",
            "argentina",
            "chile",
            "colombia",
        }

        # UK postcode pattern
        self.uk_postcode_pattern: str = r"\b[A-Z]{1,2}[0-9][A-Z0-9]?\s?[0-9][A-Z]{2}\b"

        # US zip code pattern (optionally ZIP+4)
        self.us_zipcode_pattern: str = r"\b\d{5}(?:-\d{4})?\b"

        # Canadian postal code pattern
        self.ca_postcode_pattern: str = r"\b[A-Z]\d[A-Z]\s?\d[A-Z]\d\b"

        # General postcode patterns for other countries
        self.general_postcode_patterns: list[str] = [
            r"\b\d{4,6}\b",  # 4-6 digit postcodes
            r"\b[A-Z]{2}-?\d{3,5}\b",  # Letter-number combinations
        ]

        # Attendance type indicators
        self.attendance_indicators: dict[str, list[str]] = {
            "remote": ["remote", "work from home", "wfh", "anywhere", "global", "fully remote"],
            "hybrid": ["hybrid", "flexible"],
            "on-site": ["on-site", "office", "in-person", "on site", "onsite"],
        }

    def extract_postcode(self, location_str: str) -> str | None:
        """Extract postcode from the location string

        Patterns are tried in order: UK, US, Canadian, then the general ones.
        The first pattern that matches anywhere in the string wins.

        :param location_str: Raw location string from the job posting
        :return: Postcode string if found (upper-cased for the letter-based patterns), else None"""

        location_upper = location_str.upper()

        # The letter-based patterns only contain A-Z, so they run against the
        # upper-cased text; the purely numeric US pattern uses the raw string.
        attempts = [
            (self.uk_postcode_pattern, location_upper),
            (self.us_zipcode_pattern, location_str),
            (self.ca_postcode_pattern, location_upper),
            *((pattern, location_upper) for pattern in self.general_postcode_patterns),
        ]

        for pattern, text in attempts:
            match = re.search(pattern, text)
            if match:
                return match.group().strip()

        return None

    def extract_attendance_type(self, location_str: str) -> str | None:
        """Extract attendance type from the location string

        Indicators are matched as plain substrings of the lower-cased input, so
        e.g. "offices" also counts as an "office" mention.

        :param location_str: Raw location string from the job posting
        :return: Attendance type ("remote", "hybrid", "on-site") if found, else None"""

        location_lower = location_str.lower()

        def mentions(kind: str) -> bool:
            return any(indicator in location_lower for indicator in self.attendance_indicators[kind])

        has_remote = mentions("remote")
        has_office = mentions("on-site")

        # Remote and office mentioned together is treated as hybrid, exactly
        # like an explicit hybrid indicator
        if (has_remote and has_office) or mentions("hybrid"):
            return "hybrid"
        if has_remote:
            return "remote"
        if has_office:
            return "on-site"
        return None

    def extract_country_with_match(self, location_str: str) -> tuple[str | None, str | None]:
        """Extract country from the location string and return both the standardised name and the matched text

        :param location_str: Raw location string from the job posting
        :return: Standardised country name and the matched (lower-case) variant, or (None, None) if not found"""

        location_lower = location_str.lower().strip()

        # Longest names first so "united states of america" beats "america".
        # Ties are broken alphabetically: self.countries is a set, whose
        # iteration order for strings can differ between interpreter runs, and
        # a length-only sort would make the winner among equally long names
        # depend on that order.
        ordered = sorted(self.countries, key=lambda name: (-len(name), name))

        for country in ordered:
            # Whole-word match only, so "us" does not fire inside "austin"
            if not re.search(r"\b" + re.escape(country) + r"\b", location_lower):
                continue
            if country in self._UK_VARIANTS:
                return "United Kingdom", country
            if country in self._US_VARIANTS:
                return "United States", country
            return country.title(), country

        return None, None

    def _strip_attendance_indicators(self, text: str) -> str:
        """Remove every attendance indicator (of any type) from the text

        :param text: Location string that is known to contain an attendance indicator
        :return: The text without indicators, emptied brackets or leading/trailing separators"""

        # Longest first, otherwise "remote" is removed before "fully remote"
        # gets a chance and a stray "fully" is left behind
        indicators = sorted(
            (indicator for group in self.attendance_indicators.values() for indicator in group),
            key=lambda indicator: (-len(indicator), indicator),
        )
        for indicator in indicators:
            pattern = r"\b" + re.escape(indicator) + r"\b"
            text = re.sub(pattern, "", text, flags=re.IGNORECASE)

        # "London (Remote)" is now "London ()": drop the hollow brackets
        text = self._EMPTY_BRACKETS.sub(" ", text)
        text = re.sub(r"\s+", " ", text)  # Normalize whitespace

        # Separators inside the text are kept on purpose: they are needed later
        # to split the fields and to keep a ZIP+4 such as "10001-1234" intact.
        # Only the ones dangling at either end are trimmed.
        return text.strip(" -,;|")

    def _split_fields(self, text: str) -> tuple[str | None, str | None, str | None]:
        """Split an indicator-free location string into its fields

        :param text: Location string without attendance indicators
        :return: Tuple of (country, city, postcode); each element is None when not found"""

        # Postcode first (as it's most specific)
        postcode = self.extract_postcode(text)
        if postcode:
            text = re.sub(re.escape(postcode), "", text, flags=re.IGNORECASE).strip()

        # Then the country, removing the exact variant that matched
        country, matched = self.extract_country_with_match(text)
        if country and matched:
            pattern = r"\b" + re.escape(matched) + r"\b"
            text = re.sub(pattern, "", text, flags=re.IGNORECASE).strip()

        # Unify the separators, drop filler words, then tidy up once more
        # because removing a filler word can leave separators side by side
        text = self._SEPARATORS.sub(",", text).strip(" ,")
        text = self._FILLER_WORDS.sub("", text)
        text = self._SEPARATORS.sub(",", text).strip(" ,")
        text = re.sub(r"\s+", " ", text).strip()

        # The first non-empty part that is left is taken as the city
        parts = [part.strip() for part in text.split(",") if part.strip()]
        city = parts[0].title() if parts else None

        return country, city, postcode

    def parse_location(self, location_str: str) -> tuple[LocationCreate, str | None]:
        """Parse a location string and extract country, city, postcode, and attendance type

        :param location_str: Raw location string from the job posting
        :return: Tuple of (LocationCreate object, attendance_type string or None)"""

        location_str = location_str.strip()
        if not location_str:
            return LocationCreate(), None

        # Extract attendance type first
        attendance_type = self.extract_attendance_type(location_str)

        working_str = location_str
        if attendance_type:
            # Remove the indicators for cleaner location parsing
            working_str = self._strip_attendance_indicators(working_str)

            # Nothing but punctuation left: we only have attendance type info
            if not working_str or re.match(r"^\W*$", working_str):
                return LocationCreate(), attendance_type

        country, city, postcode = self._split_fields(working_str)
        location = LocationCreate(country=country, city=city, postcode=postcode)
        return location, attendance_type

    def parse_location_only(self, location_str: str) -> LocationCreate:
        """Parse a location string and return only the location data (for backward compatibility)

        :param location_str: Raw location string from the job posting
        :return: Location schema object with parsed components"""

        location, _ = self.parse_location(location_str)
        return location