jim256 / ebay

0 stars 0 forks source link

Quote for additional data cleansing / processing / normalization #16

Closed jim256 closed 4 years ago

jim256 commented 4 years ago

I meant to include this in the original project details, but neglected to, partially because I assumed eBay would have cleaner / better data.

This is Python 2.7 code that all of my other spiders use just before they write to the db.

Would you mind giving me an idea of how much additional cost it would be to add something like this to the project?

# custom setters
@transmission.setter
def transmission(self, transmission):
    """Store the transmission description in title case.

    Falsy values (``None``, empty string) are ignored, so ``_transmission``
    is left untouched in that case.

    :param transmission: raw transmission text.
    """
    if not transmission:
        return

    self._transmission = transmission.title()

@mileage.setter
def mileage(self, mileage):
    """Store the odometer reading, repairing obviously mis-scaled values.

    Two heuristics are applied when the reading converts to an integer:

    * A reading below 300 on a vehicle whose ``year`` is earlier than last
      year is multiplied by 1000 (presumably such listings give the value
      in thousands -- confirm against real data).
    * A reading above 1,000,000 is divided by 10 until it is at most
      1,000,000.

    In every other case -- including a reading or ``self.year`` that cannot
    be converted to an integer -- the raw value is stored unchanged.

    :param mileage: raw odometer value (number or numeric string).
    """
    try:
        # Convert once instead of re-parsing on every comparison.
        miles = int(mileage)
        if miles < 300 and int(self.year) < datetime.utcnow().year - 1:
            mileage = miles * 1000
        elif miles > 1000000:
            while miles > 1000000:
                # Floor division keeps the result an int on Python 2 and 3.
                miles //= 10
            mileage = miles
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Best effort only: keep the raw value when it cannot be normalised.
        pass

    self._mileage = mileage

@drive_type.setter
def drive_type(self, drive_type):
    """Normalise a free-text drivetrain description and store it.

    Falsy input is ignored and ``_drive_type`` is left untouched.
    Otherwise the lower-cased text is checked in this order:

    1. contains ``awd``/``fwd``/``rwd``: the whole text is upper-cased;
    2. contains ``4``: ``4WD``;
    3. contains ``all``: ``AWD``;
    4. contains ``front``: ``FWD``;
    5. contains ``rear``: ``RWD``;
    6. contains ``2``: resolved from ``self.body_type`` -- ``RWD`` for
       truck/pickup/coupe/convertible bodies, ``FWD`` for any other body,
       ``2WD`` when no body type is set;
    7. anything else: the lower-cased text is stored as is.

    :param drive_type: raw drivetrain text.
    """
    if not drive_type:
        return

    text = drive_type.lower()

    # Read the body type up front; it is only consulted for the '2' case.
    current_body = self.body_type
    body = current_body.lower() if current_body else None

    if any(abbr in text for abbr in ('awd', 'fwd', 'rwd')):
        result = text.upper()
    else:
        keyword_labels = (
            ('4', '4WD'),
            ('all', 'AWD'),
            ('front', 'FWD'),
            ('rear', 'RWD'),
        )
        for keyword, label in keyword_labels:
            if keyword in text:
                result = label
                break
        else:
            if '2' not in text:
                result = text
            elif body is None:
                result = '2WD'
            elif any(kind in body for kind in
                     ('truck', 'pickup', 'coupe', 'convertible')):
                result = 'RWD'
            else:
                result = 'FWD'

    self._drive_type = result

@body_type.setter
def body_type(self, body_type):
    """Map a free-text body style onto a canonical label and store it.

    Falsy input is ignored and ``_body_type`` is left untouched.
    Otherwise the lower-cased text is matched against keyword groups in
    order; the first group with a keyword contained in the text decides
    the label. Text that matches no group is stored in title case.

    Canonical labels are stored exactly as written. Previously every
    result was passed through ``str.title()``, which turned ``'SUV'`` into
    ``'Suv'``.

    :param body_type: raw body style text.
    """
    if not body_type:
        return

    lowered = body_type.lower()

    # Order matters: earlier rules win when several keywords are present.
    rules = (
        (('sport utility', 'cross', 'suv'), 'SUV'),
        (('truck', 'pickup'), 'Truck'),
        (('sedan', 'compact'), 'Sedan'),
        (('coupe',), 'Coupe'),
        (('convertible',), 'Convertible'),
        (('van',), 'Van'),
        (('hatch', 'wagon'), 'Hatchback'),
    )
    for keywords, label in rules:
        if any(keyword in lowered for keyword in keywords):
            self._body_type = label
            return

    # Unrecognised styles are kept, only tidied up for display.
    self._body_type = lowered.title()

@fuel_type.setter
def fuel_type(self, fuel_type):
    """Reduce a free-text fuel description to Diesel, Gas or Other.

    Falsy input is ignored and ``_fuel_type`` is left untouched.
    ``diesel`` is checked before ``gas``; text containing neither keyword
    is stored as ``'Other'``.

    :param fuel_type: raw fuel description.
    """
    if not fuel_type:
        return

    lowered = fuel_type.lower()
    keyword_labels = (('diesel', 'Diesel'), ('gas', 'Gas'))
    self._fuel_type = next(
        (label for keyword, label in keyword_labels if keyword in lowered),
        'Other',
    )

@title_type.setter
def title_type(self, title_type):
    """Classify a title description as Clean or Salvage and store it.

    Falsy input is ignored and ``_title_type`` is left untouched.
    Text containing ``clean`` or ``clear`` (case-insensitive) is stored as
    ``'Clean'``; every other non-empty value is stored as ``'Salvage'``.

    :param title_type: raw title status text.
    """
    if not title_type:
        return

    lowered = title_type.lower()
    is_clean = any(word in lowered for word in ('clean', 'clear'))
    self._title_type = 'Clean' if is_clean else 'Salvage'

# ensure int values
@validates('price', 'year', '_mileage', 'num_cylinders', 'num_doors')
def validate_ints(self, key, value):
    """Coerce a value to an integer, or ``None`` when that is impossible.

    * floats are truncated with ``int()``; NaN and infinities give ``None``;
    * integers are returned unchanged;
    * text has every non-digit character removed and the remaining digits
      are parsed, e.g. ``'$12,500'`` gives ``12500``; text without digits
      gives ``None``;
    * any other type gives ``None``.

    :param key: name of the attribute being validated (unused).
    :param value: raw value to coerce.
    :returns: an integer, or ``None``.
    """
    if isinstance(value, float):
        try:
            return int(value)
        except (ValueError, OverflowError):
            # NaN / infinity have no integer representation.
            return None

    # ``long`` and ``basestring`` only exist on Python 2.
    try:
        integer_types, text_types = (int, long), (basestring,)  # noqa: F821
    except NameError:
        integer_types, text_types = (int,), (str,)

    if isinstance(value, integer_types):
        return value
    if not isinstance(value, text_types):
        return None

    # NOTE: digits are concatenated, so a decimal part is not dropped
    # ('12,500.00' becomes 1250000), matching the previous behaviour.
    digits = ''.join(ch for ch in value if ch.isdigit())
    try:
        return int(digits)
    except ValueError:
        return None
james-carpenter commented 4 years ago

I'm happy to add these cleansing rules. Regarding the cost estimate, I'll agree to whatever you think is fair.

james-carpenter commented 4 years ago

Based on reviewing those rules and profiling about 10,000 records, I have a few suggestions to eliminate some false positives and pick up some cases that may have been missed.

james-carpenter commented 4 years ago

Added these normalization rules.