## K-Means Ingredient Clustering
Created by Aidan Boyne - see the [docs](https://github.com/aidanboyne/recipes).

In [48]:
import requests as rq
from lxml import html
import pandas as pd
import numpy as np
import itertools
from tqdm import tqdm
import plotly.express as px
import random


## Functionized scraper and parser for looping
 - Input: pageNo (from loop)
 - Output:
    - If valid page --> `valid`: True (bool), `complist`: parsed ingredients (list), `rating`: AR rating (int)
    - If invalid --> `valid`: False (bool), `pageNo`: invalid page (int), `ratefail`: (bool)

In [29]:
def ar_scraper(pageNo):

    #import master ingredient list
    global ingredientSet
    #Page access URL
    URL = "https://www.allrecipes.com/recipe/"+str(pageNo)+"/"
    page = rq.get(URL)
    tree = html.fromstring(page.content)

    #Content Xpath mappings
    count2Path = "/html/body/div[3]/div/main/div[1]/div[2]/div[1]/div[1]/div[2]/div[2]/ul/li[1]/a"
    titlePath = "/html/body/div[3]/div/main/div[1]/div[2]/div[1]/div[1]/div[1]/div/h1"
    ratingPath = "/html/body/div[3]/div/main/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/span"

    #Data QC
    valid = True #check if page contains real information
    divno = 5 #flexible checker for different page layouts

    #invalid recipie no
    ratecount_str = tree.xpath(str(count2Path+"/text()"))

    if not ratecount_str: #change if needed (i.e. < 5...)
        #print("Page does not exist, skipping...")
        valid = False
        fail_type = "page not existing"
        #for main: if valid=False, add pageNo to pageOmitList
        return valid, pageNo, fail_type
    else:
        if ratecount_str[0] == ' Be the first to rate & review! ':
            ratecount = 0
        else:
            ratecount = int(''.join(x for x in ratecount_str[0] if x.isdigit()))
        #skips pages with no reviews as QC assurance, can change to vary confidence strictness
        if ratecount == 0: #change if needed (i.e. < 5...)
            if valid:
                #print("Page does not have enough reviews to be included in dataset, skipping...")
                valid = False
                fail_type = "insufficient reviews"
                #for main: if valid=False, add pageNo to pageOmitList
                return valid, pageNo, fail_type
        else:
            try:
                #rating getter
                rating = float(tree.xpath(str(ratingPath+"/text()"))[0])
                #ingredient getter loop
                listing = 1
                ingno = 1
                first  = 1
                ingredients = []
                while listing:
                    ingPath = "/html/body/div[3]/div/main/div[1]/div[2]/div[1]/div[2]/div[2]/div["+str(divno)+"]/section[1]/fieldset/ul/li["+str(ingno)+"]/label/span/span"
                    newing = tree.xpath(str(ingPath+"/text()"))
                    if not newing: #following checks for the two types of ingredient divs and switches to appropriate, also ends loop when no more ingredients
                        if not ingredients:
                            divno = 6
                        else:
                            listing = 0
                    else:
                        ingno = ingno + 1
                        ingredients.append(newing[0])

                #---------------------------------------COMMENT OUT IN IMPLEMENTATION-----------------------------------------------
                # #direction getter loop (not needed for graph analysis)
                # listing = 1
                # dirno = 1
                # directions = []
                # while listing:
                #     directionPath = "/html/body/div[3]/div/main/div[1]/div[2]/div[1]/div[2]/div[2]/section[1]/fieldset/ul/li["+str(dirno)+"]/div[1]/div/p"
                #     newdir = tree.xpath(str(directionPath+"/text()"))
                #     dirno = dirno + 1
                #     if newdir == []:
                #         listing = 0;
                #     else:
                #         directions.append(newdir[0])        
                # #Other variable assignments
                # title = tree.xpath(str(titlePath+"/text()"))[0]
                #---------------------------------------END COMMENT OUT-------------------------------------------------------------
            except:
                valid = False
                fail_type = "invalid page format"
                return valid, pageNo, fail_type

    if valid:

        #---------------------------------------COMMENT OUT IN IMPLEMENTATION-----------------------------------------------
        # #Formatted display of title, rating, ingredients, and directions objects extracted

        # print(title+"\n-----------------------------")
        # print("Rating: "+str(rating)+"\n\nIngredients \n")
        # for ingredient in ingredients:
        #     print(" - "+ ingredient)
        # print("\nDirections")
        # for direction in directions:
        #     print(" - "+ direction)
        #---------------------------------------END COMMENT OUT-------------------------------------------------------------

        # Checking each scraped ingredient one by one to see if it has word(s) containing master ingredient
        # Longer whole-word substrings preferred to eliminate partial duplication and reduce lopsdd
        complist=[]
        extractIngredients = pd.DataFrame(ingredients, columns=['Extracted Ingredients']) #ingredients scraped
        for setIngredient in ingredientSet['Ingredient']: #for each ingredient in the master list
            regex_str = "\\b" + setIngredient + "\\b"
            check = extractIngredients['Extracted Ingredients'].str.contains(pat=regex_str, case='False', na='False') #do ingredients scraped contain substring matching master
            if any(check):
                if setIngredient not in complist:
                    complist.append(setIngredient)
        complist = sorted([*set(complist)],key=len,reverse=True) #remove any duplicates due to substring matching and sort from longest to shortest

        out = []
        for s in complist:
            if not any([s in o for o in out]):
                out.append(s)
        complist = out
        return valid, complist, rating, ratecount

### TODO: Combination Architecture
- (DONE) Create list of all possible ingredient combinations from scraped ingredients
- (DONE) Create temporary dataframe from list to hold these results
- (DONE) Concatenate temp dataframe to master dataframe containing previous combos and ratings
- (DONE) Change if/else loops to try/except to catch wider unavailibility
- (DONE) Every 100 recipies, aggregate identical ingredient combinations in master dataframe by rating
- (DONE) Add weighting based on number of ratings to aggregation system
- Make API for website to access text file of allrecipies
- Find better weighting function

In [77]:
#Page Selection
page_start = 258468
page_end = 258568

#Initializations
omitted_pages = []
masterframe = pd.DataFrame()
prog_range = tqdm(range(page_start,page_end))


#Test Loop
for pageNumber in prog_range:
    scraped=ar_scraper(pageNumber)
    if scraped[0]:
        combolist=list(itertools.combinations(scraped[1],2))
        comboframe = pd.DataFrame(list(itertools.combinations(scraped[1], 2)), columns=['Ingredient 1','Ingredient 2'])
        comboframe['Rating']= ([scraped[2]]*len(comboframe))
        comboframe['Weight']= ([scraped[3]]*len(comboframe))
        comboframe['Score'] = 100*comboframe.Rating+comboframe.Weight#Bad weighting function, fine-tune this later
        masterframe = pd.concat([masterframe, comboframe], axis=0, ignore_index=True)
    else:
        omitted_pages.append(str(scraped[1])+": "+scraped[2])

    #Aggregation every 100 entries
    if pageNumber%100 == 0:
        masterframe.reindex()
        masterframe.groupby(by=['Ingredient 1', 'Ingredient 2'])['Rating', 'Weight'].mean()

    prog_range.set_description(f'Processing page {pageNumber}')

#Noiseing Data for visualization
randlist = []
for i in range(0,len(masterframe['Score'])):
    randlist.append((random.randrange(-200, 200))/10)
randcol = pd.DataFrame(randlist)
masterframe['Score'] = masterframe['Score'].add(randcol[0])

#Final aggregation and sorting before analysis
masterframe.groupby(by=['Ingredient 1', 'Ingredient 2'])['Rating', 'Weight'].mean()
masterframe.sort_values(by=['Score'], ascending=False)
masterframe.reindex()

print("Omitted Pages: ")
print(omitted_pages)
print("Non-aggregated Ingredient List")
print(masterframe)


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.

Processing page 258567: 100%|██████████| 100/100 [06:06<00:00,  3.67s/it]

Omitted Pages: 
['258470: insufficient reviews', '258477: insufficient reviews', '258478: insufficient reviews', '258489: insufficient reviews', '258497: invalid page format', '258498: insufficient reviews', '258499: invalid page format', '258500: page not existing', '258501: page not existing', '258502: page not existing', '258503: page not existing', '258504: page not existing', '258505: page not existing', '258506: page not existing', '258507: page not existing', '258508: page not existing', '258509: page not existing', '258510: page not existing', '258511: page not existing', '258512: page not existing', '258513: page not existing', '258514: page not existing', '258515: page not existing', '258516: page not existing', '258517: page not existing', '258518: page not existing', '258519: page not existing', '258520: page not existing', '258521: page not existing', '258522: page not existing', '258523: page not existing', '258524: page not existing', '258525: page not existing', '258526



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



In [78]:
fig = px.scatter(masterframe, x="Rating", y="Score", size = "Rating", color="Ingredient 1", hover_data=['Ingredient 1', 'Ingredient 2'])
fig.show()