Il tuo carrello è vuoto!
import re
from dataclasses import dataclass, asdict
from typing import List, Optional, Dict
# -------------------------------------------------
# 1️⃣ CONFIGURATION – extend these as needed
# -------------------------------------------------
# Lookup tables used by the parser. They are sets so that every `in` test
# compares whole tokens (a bare string would turn `in` into a substring test).
KNOWN_KEYWORDS = {
    "payudara", "mulus", "basah", "cantik",  # descriptive adjectives
}
KNOWN_BRANDS = {"dmx", "arummm", "mango"}
KNOWN_PLATFORMS = {"indo18"}  # you can add more platforms here
# -------------------------------------------------
# 2️⃣ DATA MODEL
# -------------------------------------------------
@dataclass
class MetaInfo:
    """Structured metadata extracted from a raw title string."""

    # Descriptive keywords found in the title (required, no default).
    keywords: List[str]
    # Brand token matched in the title, or None when no brand was found.
    brand: Optional[str] = None
    # Series name; None when not determined.
    series: Optional[str] = None
    # Numeric identifier kept as a string of digits, or None when absent.
    numeric_id: Optional[str] = None
    # Platform tag matched in the title, or None when no platform was found.
    platform: Optional[str] = None
    # Whether the title carries a verification marker.
    is_verified: bool = False
# -------------------------------------------------
# 3️⃣ PARSER LOGIC
# -------------------------------------------------
# Matches the word "id" (any letter case), optional whitespace, then a run of
# at least five digits; the digits are captured in group 1.
ID_PATTERN = re.compile(r"\b(?:id|ID)\s*(\d{5,})\b", flags=re.IGNORECASE)
# Matches the standalone word "verified" in any letter case.
VERIFIED_PATTERN = re.compile(r"\bverified\b", flags=re.IGNORECASE)
def parse_raw_title(raw: str) -> MetaInfo:
    """Extract structured metadata from a free-form title string.

    Args:
        raw: The title text; it is split on whitespace into tokens.

    Returns:
        A MetaInfo populated as follows:
          * keywords   - lower-cased, purely alphabetic tokens that are listed
                         in KNOWN_KEYWORDS and are not the brand, the platform,
                         "id", "verified", or any entry of KNOWN_BRANDS.
          * brand      - the first lower-cased token found in KNOWN_BRANDS,
                         or None.
          * series     - always None (placeholder).
          * numeric_id - the digits captured by ID_PATTERN, or None.
          * platform   - the first lower-cased token found in KNOWN_PLATFORMS,
                         or None.
          * is_verified - True when VERIFIED_PATTERN matches anywhere in raw.
    """
    # Token matching is done on lower-cased words; the regex searches below
    # run on the untouched input string.
    lowered = [tok.lower() for tok in raw.split()]

    # 1) Detect numeric ID
    id_match = ID_PATTERN.search(raw)
    numeric_id = id_match.group(1) if id_match else None

    # 2) Detect verification flag
    is_verified = bool(VERIFIED_PATTERN.search(raw))

    # 3) Find known brand (first match wins)
    brand = next((tok for tok in lowered if tok in KNOWN_BRANDS), None)

    # 4) Find platform tag
    platform = next((tok for tok in lowered if tok in KNOWN_PLATFORMS), None)

    # 5) Gather free-form descriptive keywords (exclude already-used tokens).
    #    None entries in the set are harmless: tokens are always strings.
    excluded = {brand, platform, "id", numeric_id, "verified"}
    keywords = [
        tok
        for tok in lowered
        if tok not in excluded and tok.isalpha() and tok not in KNOWN_BRANDS
    ]

    # 6) Filter keywords against the known-keyword list (optional).
    #    If you want to keep *all* free-form words, comment the line below.
    keywords = [kw for kw in keywords if kw in KNOWN_KEYWORDS]

    return MetaInfo(
        keywords=keywords,
        brand=brand,
        series=None,  # placeholder - can be derived from other patterns
        numeric_id=numeric_id,
        platform=platform,
        is_verified=is_verified,
    )
# -------------------------------------------------
# 4️⃣ USAGE EXAMPLE
# -------------------------------------------------
if __name__ == "__main__":
    # Demo: parse one sample title and print the resulting fields as a dict.
    raw_example = "payudara mulus basah dmx arummm cantik id 72391227 mango indo18 verified"
    meta = parse_raw_title(raw_example)
    print("Parsed metadata →", asdict(meta))
{
  "keywords": ["payudara", "mulus", "basah", "cantik"],
  "brand": "dmx",
  "series": null,
  "numeric_id": "72391227",
  "platform": "indo18",
  "is_verified": true
}
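The words “arummm” and “mango” are also in the KNOWN_BRANDS set, but only the first brand match (“dmx”) is reported; the remaining brand tokens are ignored and never appear among the keywords (extend the parser if you need to capture more than one brand).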
| Target system | How you would plug the parser in |
|---------------|----------------------------------|
| Web back‑end (e.g., Flask/Django) | Call parse_raw_title() when a user submits a new title, store the resulting dict in your DB model. |
| CLI batch importer | Loop over a CSV file, feed each title to the parser, write the JSON output to a new column or a separate file. |
| Realtime chat bot / moderation tool | Run the parser on every incoming message; if is_verified is False you could flag the content for review. |
| Search indexer (Elasticsearch / Algolia) | Index each field (keywords, brand, platform, etc.) separately for faceted navigation. |