Coverage for backend / app / job_email_scraping / location_parser.py: 100%
51 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-17 21:34 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-17 21:34 +0000
1"""Location Parser Module
3Extracts attendance type from job posting location strings and returns
4a cleaned location string with attendance indicators removed."""
6import re
9class LocationParser:
10 """Parser for extracting attendance type from job location strings"""
12 def __init__(self) -> None:
13 # Attendance type indicators
14 self.attendance_indicators = {
15 "remote": ["remote", "work from home", "wfh", "anywhere", "global", "fully remote"],
16 "hybrid": ["hybrid", "flexible"],
17 "on-site": ["on-site", "office", "in-person", "on site", "onsite"],
18 }
20 def extract_attendance_type(self, location_str: str) -> str | None:
21 """Extract attendance type from the location string
22 :param location_str: Raw location string from the job posting
23 :return: Attendance type ("remote", "hybrid", "on-site") if found, else None"""
25 location_lower = location_str.lower()
27 # Check if both remote and office/on-site indicators are present -> hybrid
28 has_remote = any(indicator in location_lower for indicator in self.attendance_indicators["remote"])
29 has_office = any(indicator in location_lower for indicator in self.attendance_indicators["on-site"])
31 if has_remote and has_office:
32 return "hybrid"
34 # Check for explicit hybrid indicators
35 for indicator in self.attendance_indicators["hybrid"]:
36 if indicator in location_lower:
37 return "hybrid"
39 # Check remote indicators
40 if has_remote:
41 return "remote"
43 # Check on-site indicators
44 if has_office:
45 return "on-site"
47 return None
49 def remove_attendance_indicators(self, location_str: str) -> str:
50 """Remove attendance type indicators from location string
51 :param location_str: Raw location string
52 :return: Cleaned location string with attendance indicators removed"""
54 cleaned = location_str
56 # Collect all indicators to remove
57 all_indicators = []
58 for indicators in self.attendance_indicators.values():
59 all_indicators.extend(indicators)
61 # Sort by length (longest first) to avoid partial matches
62 all_indicators.sort(key=len, reverse=True)
64 # Remove each indicator (case-insensitive)
65 for indicator in all_indicators:
66 pattern = re.compile(re.escape(indicator), re.IGNORECASE)
67 cleaned = pattern.sub("", cleaned)
69 # Remove common conjunctions/connectors used between attendance types
70 connectors = [r"\bor\b", r"\band\b", r"\bor/and\b", r"\b&\b"]
71 for connector in connectors:
72 cleaned = re.sub(connector, "", cleaned, flags=re.IGNORECASE)
74 # Remove empty parentheses, brackets, and braces
75 cleaned = re.sub(r"\(\s*\)", "", cleaned) # Empty ()
76 cleaned = re.sub(r"\[\s*]", "", cleaned) # Empty []
77 cleaned = re.sub(r"\{\s*}", "", cleaned) # Empty {}
79 # Clean up extra whitespace, commas, and separators
80 cleaned = re.sub(r"\s*[,/|•·-]\s*", ", ", cleaned) # Normalize separators
81 cleaned = re.sub(r"\s+", " ", cleaned) # Multiple spaces to single
82 cleaned = re.sub(r"^[,\s]+|[,\s]+$", "", cleaned) # Trim leading/trailing
83 cleaned = re.sub(r",+", ",", cleaned) # Multiple commas to single
84 cleaned = re.sub(r",\s*,", ",", cleaned) # Remove duplicate commas with spaces
86 # If the result is only wrapped in parentheses/brackets, unwrap it
87 cleaned = re.sub(r"^\(\s*(.*?)\s*\)$", r"\1", cleaned) # (content) -> content
88 cleaned = re.sub(r"^\[\s*(.*?)\s*]$", r"\1", cleaned) # [content] -> content
89 cleaned = re.sub(r"^\{\s*(.*?)\s*}$", r"\1", cleaned) # {content} -> content
91 return cleaned.strip()
93 def parse_location(self, location_str: str) -> tuple[str, str | None]:
94 """Parse a location string and extract the cleaned location and attendance type
95 :param location_str: Raw location string from the job posting
96 :return: Tuple of (cleaned location string, attendance_type string or None)"""
98 if not location_str:
99 return "", None
101 location_str = location_str.strip()
103 if not location_str:
104 return "", None
106 # Extract attendance type first
107 attendance_type = self.extract_attendance_type(location_str)
109 # Remove attendance indicators from location string
110 cleaned_location = self.remove_attendance_indicators(location_str)
112 return cleaned_location, attendance_type