import pandas as pd
import re
import time
import os
import shutil

# Record the start time so the elapsed run time can be reported at the end
start_time = time.time()

# Directory containing this script; input and output files are resolved against it
base_path = os.path.dirname(os.path.abspath(__file__))

# Folder holding the lookup CSVs and the persisted ID table
data_folder = os.path.join(base_path, 'data')

# Create the data folder if it doesn't exist. exist_ok=True avoids the race
# between a separate existence check and the creation call.
os.makedirs(data_folder, exist_ok=True)

# Encodings to try, in order, when a CSV cannot be read as UTF-8.
# 'latin1' and 'iso-8859-1' name the same codec, which maps every possible
# byte and therefore never raises UnicodeDecodeError. They have to come last:
# anything listed after them (previously 'windows-1252') would never be tried.
encodings = ['utf-8', 'windows-1252', 'latin1', 'iso-8859-1']

# Function to read the CSV file with different encodings
def read_csv_with_encoding(file_path, candidate_encodings=None):
    """Read a CSV file, trying several text encodings until one decodes it.

    Args:
        file_path: Path of the CSV file to read.
        candidate_encodings: Optional iterable of encoding names to try in
            order. Defaults to the module-level ``encodings`` list.

    Returns:
        The DataFrame produced by the first encoding that decodes the file.

    Raises:
        UnicodeDecodeError: If none of the encodings can decode the file. The
            failure of the last attempted encoding is attached as the cause.
    """
    if candidate_encodings is None:
        candidate_encodings = encodings

    last_error = None
    for encoding in candidate_encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError as err:
            # Remember the failure and fall through to the next encoding.
            last_error = err

    reason = "Unable to decode the file with any of the specified encodings."
    # UnicodeDecodeError takes (encoding, object, start, end, reason); calling
    # it with only a message raises TypeError instead of the intended error.
    if last_error is None:
        # Nothing was attempted (empty encoding list).
        raise UnicodeDecodeError("unknown", b"", 0, 0, reason)
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        reason,
    ) from last_error

# Read the CSV files. All three go through the encoding fallback so that a
# lookup file saved in a non-UTF-8 encoding is handled the same way as the
# product export instead of aborting the run.
product_df = read_csv_with_encoding(os.path.join(base_path, "productinfo.csv"))
colors_df = read_csv_with_encoding(os.path.join(data_folder, "colornames.csv"))
exclude_list_df = read_csv_with_encoding(os.path.join(data_folder, "exclude-list.csv"))

# Function to find all color matches in the description
def find_colors(description, color_names=None):
    """Return the known color names that occur in a product description.

    The description is split on "/" and every known color name is searched
    for, case-insensitively and as a plain substring, in each part.

    Args:
        description: Product description text. Non-string values (such as
            the NaN pandas uses for empty cells) yield ``None``.
        color_names: Optional iterable of color names to look for. Defaults
            to the ``name`` column of the module-level ``colors_df``.

    Returns:
        The matched color names joined with "/", in order of first
        appearance, or ``None`` when nothing matched.
    """
    if not isinstance(description, str):
        return None
    if color_names is None:
        color_names = colors_df['name']

    # Lower-case each candidate once per call; skip blank/NaN entries, which
    # cannot be lower-cased or would match every description.
    candidates = [
        (name, name.lower())
        for name in color_names
        if isinstance(name, str) and name
    ]

    # A dict keeps matches unique while preserving first-seen order, so the
    # joined result is the same on every run (set order is not).
    matched_colors = {}
    for term in description.split('/'):
        lowered_term = term.lower()
        for name, lowered_name in candidates:
            if lowered_name in lowered_term:
                matched_colors.setdefault(name, None)

    return '/'.join(matched_colors) if matched_colors else None

# Get the list of excluded keywords, skipping blank/NaN cells so they cannot
# break the pattern built below
excluded_keywords = [
    str(keyword)
    for keyword in exclude_list_df['keywords'].tolist()
    if pd.notna(keyword) and str(keyword).strip()
]

# Apply the function to each description and create a new column with the matched colors
product_df['Matched_Colors'] = product_df['DESCRIPTION'].apply(find_colors)

# Replace None values with an empty string. Assign the result back instead of
# calling fillna(inplace=True) on the column selection, which is chained
# assignment and does not reliably update the parent frame.
product_df['Matched_Colors'] = product_df['Matched_Colors'].fillna('')

# Clone the Matched_Colors column into Color_Code and Color_Title
product_df['Color_Code'] = product_df['Matched_Colors']
product_df['Color_Title'] = product_df['Matched_Colors']

# Filter out rows whose matched colors contain an excluded keyword.
# Keywords are escaped so they match literally, and the filter is skipped when
# there are none: an empty pattern would match (and drop) every row.
if excluded_keywords:
    exclusion_pattern = '|'.join(re.escape(keyword) for keyword in excluded_keywords)
    is_excluded = product_df['Matched_Colors'].str.contains(exclusion_pattern, case=False)
    filtered_product_df = product_df[~is_excluded]
else:
    filtered_product_df = product_df.copy()

# Keep only products whose CATEGORY_NAME1 is APPAREL.
# NOTE(review): no SWEATSHIRTS condition is applied here despite the variable
# name. Also, apparel_sweatshirts_df is reassigned from product_df later in
# this script, so the exclusion filter and the merge below do not reach the
# Excel output - confirm whether that is intended.
apparel_sweatshirts_df = filtered_product_df[filtered_product_df['CATEGORY_NAME1'] == 'APPAREL']
# Ensure 'PRODUCT' column exists in apparel_sweatshirts_df
if 'PRODUCT' not in apparel_sweatshirts_df.columns:
    raise KeyError("'PRODUCT' column not found in apparel_sweatshirts_df. Check column names or adjust code accordingly.")

# Merge apparel_sweatshirts_df with product_df on 'PRODUCT' column.
# Both sides carry DESCRIPTION, so the result has DESCRIPTION_x / DESCRIPTION_y.
apparel_sweatshirts_df = apparel_sweatshirts_df.merge(product_df[['PRODUCT', 'DESCRIPTION']], on='PRODUCT', how='left')

# Debugging print to check columns
print("Columns after merge:", apparel_sweatshirts_df.columns)

# Function to extract sizes from description
# Garment size tokens recognised as standalone, case-sensitive words.
_SPECIFIC_SIZES = ('XXS', 'XS', 'S', 'M', 'L', 'XL', '2XL', '3XL', '4XL', 'XXL', 'XXXL', 'XXXXL')
# Compiled once at import time instead of being rebuilt for every row.
_SPECIFIC_SIZES_RE = re.compile(r'\b(' + '|'.join(_SPECIFIC_SIZES) + r')\b')


def extract_sizes(description):
    """Extract a size label from a free-text description.

    The checks run in a fixed order and the first one that matches wins:
    "one size", a numeric range after "size", "Size EU<n> / US<n>", a generic
    "Size <token>", a standalone letter size, dimensions with a unit, an
    "EU size <n>" number and finally any standalone 2-3 digit number.

    Args:
        description: The text to inspect. Missing values (None/NaN) give an
            empty string; other non-string values are converted with ``str``
            so numeric cells do not raise.

    Returns:
        The extracted size as a string, or ``''`` when nothing was recognised.
    """
    if pd.isna(description):
        return ''  # Return empty string for NaN values
    if not isinstance(description, str):
        # pandas yields ints/floats for numeric-looking cells; the string
        # methods and regexes below need text.
        description = str(description)

    # Replace semicolons with dots to standardize the format
    description = description.replace(';', '.')

    # "one size" / "1 size" (e.g. "one size fits most")
    if re.search(r'\b(one|1)\s*size\b', description, re.IGNORECASE):
        return 'one size'

    # Size ranges (e.g., "EU size 42-46"): keep only the numeric range
    range_match = re.search(r'\b(?:EU|UK|US)?\s*size\s*(\d+-\d+)', description, re.IGNORECASE)
    if range_match:
        return range_match.group(1)

    # "Size EU42 / US9" format: keep the EU number
    eu_us_match = re.search(r'\bSize\s*EU(\d+)\s*/\s*US\d+(\.\d+)?\b', description, re.IGNORECASE)
    if eu_us_match:
        return eu_us_match.group(1)

    # Generic size: "Size"/"SIZE"/"size" followed by word characters or "/"
    size_match = re.search(r'(?:\bSize\b|\bSIZE\b|\bsize\b)\s*([\w/]+)', description)
    if size_match:
        return size_match.group(1)

    # Standalone letter sizes such as "XL" or "2XL"
    specific_sizes_match = _SPECIFIC_SIZES_RE.search(description)
    if specific_sizes_match:
        return specific_sizes_match.group(1)

    # Dimensions: two or three numbers separated by "x" and followed by a unit
    dimensions_match = re.search(r'\b(\d+(?:[.,]\d+)?)\s*[xX]\s*(\d+(?:[.,]\d+)?)\s*(?:[xX]\s*(\d+(?:[.,]\d+)?))?\s*(cm|mm|inch|in)\b', description)
    if dimensions_match:
        dimensions = f"{dimensions_match.group(1)} x {dimensions_match.group(2)}"
        if dimensions_match.group(3):
            dimensions += f" x {dimensions_match.group(3)}"
        return f"{dimensions} {dimensions_match.group(4)}"

    # EU sizes: strip off the prefix and keep the number
    eu_size_match = re.search(r'\bEU\s*size\s*(\d+)', description, re.IGNORECASE)
    if eu_size_match:
        return eu_size_match.group(1)

    # Last resort: any standalone two- or three-digit number
    single_number_match = re.search(r'\b(\d{2,3})\b', description)
    if single_number_match:
        return single_number_match.group(1)

    return ''

# Extract sizes from 'DESCR_TYPE'
product_df['Sizes'] = product_df['DESCR_TYPE'].apply(extract_sizes)

# Clean the Sizes column by collapsing runs of whitespace into single spaces
product_df['Sizes'] = product_df['Sizes'].astype(str).apply(lambda x: ' '.join(x.split()))

# Load the persisted ID table if it exists
id_csv_path = os.path.join(data_folder, 'color_size_ids.csv')
if os.path.exists(id_csv_path):
    # Read every column as text and keep empty cells as '' rather than NaN.
    # With type inference, zero-padded IDs ("00001") come back as integers,
    # numeric sizes ("42") as numbers and the empty value as NaN, so stored
    # values would no longer match and would be given new IDs on every run.
    id_df = pd.read_csv(id_csv_path, dtype=str, keep_default_na=False)
else:
    # Create an empty DataFrame with columns Type, Value, and ID
    id_df = pd.DataFrame(columns=['Type', 'Value', 'ID'])

# Function to assign IDs and update the ID DataFrame
def assign_ids(df, id_df, column_name, type_name):
    """Look up or create an ID for every value in one column of ``df``.

    Args:
        df: DataFrame containing the column to process.
        id_df: ID table with ``Type``, ``Value`` and ``ID`` columns. It is
            not modified in place.
        column_name: Name of the column in ``df`` whose values need IDs.
        type_name: Value of ``Type`` that scopes the lookup (the same value
            may carry a different ID under another type).

    Returns:
        A tuple ``(ids, id_df)``: the list of IDs, one per row of ``df`` in
        order, and the ID table with any newly created rows appended.
    """
    # Index the existing rows of this type once, keeping the first ID seen
    # for a value, instead of rescanning the table for every row of df.
    same_type = id_df['Type'] == type_name
    known_ids = {}
    for value, existing_id in zip(
        id_df.loc[same_type, 'Value'].values,
        id_df.loc[same_type, 'ID'].values,
    ):
        known_ids.setdefault(value, existing_id)

    ids = []
    new_rows = []
    # New IDs continue from the table size, zero-padded to five digits.
    next_number = len(id_df) + 1
    for value in df[column_name]:
        if value not in known_ids:
            known_ids[value] = f"{next_number:05d}"
            new_rows.append({'Type': type_name, 'Value': value, 'ID': known_ids[value]})
            next_number += 1
        ids.append(known_ids[value])

    # Append all new rows in one concat rather than one concat per new value.
    if new_rows:
        id_df = pd.concat([id_df, pd.DataFrame(new_rows)], ignore_index=True)
    return ids, id_df

# Assign IDs to Color_Code and Sizes
product_df['Color_ID'], id_df = assign_ids(product_df[['Color_Code']], id_df, 'Color_Code', 'Color')
product_df['Size_ID'], id_df = assign_ids(product_df[['Sizes']], id_df, 'Sizes', 'Size')

# Save the updated ID DataFrame to CSV
id_df.to_csv(id_csv_path, index=False)

# Keep only products whose CATEGORY_NAME1 is APPAREL. Take an explicit copy so
# the column added and the columns dropped below act on an independent frame
# rather than on a slice of product_df.
# NOTE(review): this rebuilds the frame from product_df, so rows removed by the
# excluded-keyword filter earlier in the script are present again here, and no
# SWEATSHIRTS condition is applied - confirm whether that is intended.
apparel_sweatshirts_df = product_df[product_df['CATEGORY_NAME1'] == 'APPAREL'].copy()

# Ensure the 'DESCRIPTION' column exists before grouping on it
if 'DESCRIPTION' not in apparel_sweatshirts_df.columns:
    raise KeyError("'DESCRIPTION' column not found in apparel_sweatshirts_df. Check column names or adjust code accordingly.")

# Column whose value groups product variants together
description_col = 'DESCRIPTION'

# Give every product sharing a description the same Parent_ID: the smallest
# PRODUCT value in the group with '1' appended
parent_ids = apparel_sweatshirts_df.groupby(description_col)['PRODUCT'].transform(lambda x: str(x.min()) + '1')
apparel_sweatshirts_df['Parent_ID'] = parent_ids

# Drop columns that are not wanted in the export; columns absent from the
# input are skipped instead of aborting the run at the final step
apparel_sweatshirts_df = apparel_sweatshirts_df.drop(
    columns=['DESCRIPTION2', 'BRAND_CODE', 'BRAND_NAME', 'UM', 'CATEGORY_CODE1',
             'CATEGORY_NAME1', 'CATEGORY_CODE2', 'CATEGORY_NAME2', 'CATEGORY_CODE3', 'CATEGORY_NAME3',
             'IMAGE_REFERENCE'],
    errors='ignore',
)

# Write the filtered data to an Excel file
output_xlsx_path = os.path.join(base_path, 'apparel_data_with_colors_and_sizes.xlsx')
apparel_sweatshirts_df.to_excel(output_xlsx_path, sheet_name='Apparel Sweatshirts Data', index=False)

# Calculate the elapsed time
elapsed_time = time.time() - start_time

# Report where the output went and how long the run took
print(f"Filtered apparel sweatshirts data saved in {output_xlsx_path}.")
print(f"Elapsed time: {elapsed_time:.2f} seconds")
