A review example: Titanic survivors

require(arules)
## Loading required package: arules
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
require(effects)
## Loading required package: effects
#data(TitanicSurvival)
# Keep only passenger sex and survival status. Building the data frame from
# `$` extractions yields the column names TitanicSurvival.sex and
# TitanicSurvival.survived, which appear as item labels in the rules below.
mydata <- data.frame(TitanicSurvival$sex, TitanicSurvival$survived)

# Mine rules made of exactly two items (minlen = maxlen = 2), with minimum
# support and minimum confidence both set to 0.01.
rules <- apriori(
  mydata,
  parameter = list(
    support = 0.01,
    confidence = 0.01,
    target = "rules",
    minlen = 2,
    maxlen = 2
  )
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.01    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target   ext
##       2  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 13 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4 item(s), 1309 transaction(s)] done [0.00s].
## sorting and recoding items ... [4 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [8 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Optional: uncomment for a closer look at the mined rules.
# summary(rules)
# inspect(rules)
# "leverage" is not calculated by default, so rebuild the rule quality table
# with interestMeasure() to include it alongside support, confidence and lift.
quality(rules) <- interestMeasure(
  rules,
  measure = c("support", "confidence", "leverage", "lift"),
  transactions = mydata
)
inspect(rules)
##     lhs                               rhs                               support confidence   leverage      lift
## [1] {TitanicSurvival.sex=female}   => {TitanicSurvival.survived=yes} 0.25897632  0.7274678  0.1229958 1.9045107
## [2] {TitanicSurvival.survived=yes} => {TitanicSurvival.sex=female}   0.25897632  0.6780000  0.1229958 1.9045107
## [3] {TitanicSurvival.sex=female}   => {TitanicSurvival.survived=no}  0.09702063  0.2725322 -0.1229958 0.4409699
## [4] {TitanicSurvival.survived=no}  => {TitanicSurvival.sex=female}   0.09702063  0.1569839 -0.1229958 0.4409699
## [5] {TitanicSurvival.survived=yes} => {TitanicSurvival.sex=male}     0.12299465  0.3220000 -0.1229958 0.4999976
## [6] {TitanicSurvival.sex=male}     => {TitanicSurvival.survived=yes} 0.12299465  0.1909846 -0.1229958 0.4999976
## [7] {TitanicSurvival.survived=no}  => {TitanicSurvival.sex=male}     0.52100840  0.8430161  0.1229958 1.3090250
## [8] {TitanicSurvival.sex=male}     => {TitanicSurvival.survived=no}  0.52100840  0.8090154  0.1229958 1.3090250

To visualize rules:

library(arulesViz)
## Loading required package: grid
# Default plot of the mined rules.
plot(rules) 

# The same rules drawn with the "graph" method.
plot(rules, method="graph") 

**Exercises**

  1. Which of the rules would you consider meaningful, i.e. indicating a co-occurrence that is not due to chance? Explain.
  2. Add a variable “AgeGroup” which is set to “Child” if a person’s age is less than or equal to 16, and “Adult” otherwise. Repeat the association rule mining, comment on the results.

Market basket analysis revisited:

Load the groceries data that comes with arules package

require(arules)
# Load the Groceries data set and report its dimensions.
data("Groceries")
dim(Groceries)
## [1] 9835  169
# Show the first three transactions.
inspect(head(Groceries, n = 3))
##     items                
## [1] {citrus fruit,       
##      semi-finished bread,
##      margarine,          
##      ready soups}        
## [2] {tropical fruit,     
##      yogurt,             
##      coffee}             
## [3] {whole milk}
# Plot the absolute frequencies of the 20 most frequent items.
itemFrequencyPlot(Groceries, topN = 20, type = "absolute")

Find the rules:

# Mine rules with at least two items, minimum support 0.01 and minimum
# confidence 0.5, from the Groceries transactions.
rules <- apriori(
  Groceries,
  parameter = list(support = 0.01, confidence = 0.5, minlen = 2)
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 98 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [88 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [15 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Rebuild the rule quality table so that it also contains "leverage", which
# is not calculated by default. The transactions argument must be the data
# set these rules were mined from (Groceries); the previous code passed
# `mydata`, the Titanic data frame from the earlier example, by mistake.
quality(rules) <- interestMeasure(
  rules,
  measure = c("support", "confidence", "leverage", "lift"),
  transactions = Groceries
)
inspect(rules)
##      lhs                     rhs                   support confidence    leverage     lift
## [1]  {curd,                                                                               
##       yogurt}             => {whole milk}       0.01006609  0.5823529 0.005649444 2.279125
## [2]  {other vegetables,                                                                   
##       butter}             => {whole milk}       0.01148958  0.5736041 0.006371464 2.244885
## [3]  {other vegetables,                                                                   
##       domestic eggs}      => {whole milk}       0.01230300  0.5525114 0.006613319 2.162336
## [4]  {yogurt,                                                                             
##       whipped/sour cream} => {whole milk}       0.01087951  0.5245098 0.005579536 2.052747
## [5]  {other vegetables,                                                                   
##       whipped/sour cream} => {whole milk}       0.01464159  0.5070423 0.007263188 1.984385
## [6]  {pip fruit,                                                                          
##       other vegetables}   => {whole milk}       0.01352313  0.5175097 0.006846201 2.025351
## [7]  {citrus fruit,                                                                       
##       root vegetables}    => {other vegetables} 0.01037112  0.5862069 0.006947868 3.029608
## [8]  {tropical fruit,                                                                     
##       root vegetables}    => {other vegetables} 0.01230300  0.5845411 0.008230506 3.020999
## [9]  {tropical fruit,                                                                     
##       root vegetables}    => {whole milk}       0.01199797  0.5700483 0.006620049 2.230969
## [10] {tropical fruit,                                                                     
##       yogurt}             => {whole milk}       0.01514997  0.5173611 0.007667655 2.024770
## [11] {root vegetables,                                                                    
##       yogurt}             => {other vegetables} 0.01291307  0.5000000 0.007915900 2.584078
## [12] {root vegetables,                                                                    
##       yogurt}             => {whole milk}       0.01453991  0.5629921 0.007940918 2.203354
## [13] {root vegetables,                                                                    
##       rolls/buns}         => {other vegetables} 0.01220132  0.5020921 0.007499264 2.594890
## [14] {root vegetables,                                                                    
##       rolls/buns}         => {whole milk}       0.01270971  0.5230126 0.006500424 2.046888
## [15] {other vegetables,                                                                   
##       yogurt}             => {whole milk}       0.02226741  0.5128806 0.011173834 2.007235

Exercises: 1. Which rule indicates an association which is ‘most surprising’? 2. Which rule is most interesting from a marketing perspective?

Tutorial Exercise

The data for this exercise is provided at the course website. It was created by following the tutorial below: https://datascienceplus.com/a-gentle-introduction-on-market-basket-analysis%E2%80%8A-%E2%80%8Aassociation-rules/ which uses the dataset at https://archive.ics.uci.edu/ml/datasets/Online+Retail

Load the data

# Read market_basket.csv as comma-separated, basket-format transactions and
# print a summary of the resulting object.
transactions <- read.transactions(
  file = "market_basket.csv",
  format = "basket",
  sep = ","
)
summary(transactions)
## transactions as itemMatrix in sparse format with
##  19297 rows (elements/itemsets/transactions) and
##  27165 columns (items) and a density of 0.0006701659 
## 
## most frequent items:
## WHITE HANGING HEART T-LIGHT HOLDER           REGENCY CAKESTAND 3 TIER 
##                               1758                               1660 
##            JUMBO BAG RED RETROSPOT                      PARTY BUNTING 
##                               1434                               1271 
##      ASSORTED COLOUR BIRD ORNAMENT                            (Other) 
##                               1237                             343943 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##    1 2263 1189  851  768  725  662  618  597  582  554  572  506  487  508 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30 
##  504  503  449  413  477  420  383  304  313  270  237  253  223  204  222 
##   31   32   33   34   35   36   37   38   39   40   41   42   43   44   45 
##  216  171  147  138  147  130  111  116   89  104   96   92   85   94   61 
##   46   47   48   49   50   51   52   53   54   55   56   57   58   59   60 
##   67   73   67   64   52   49   59   50   41   53   50   35   24   40   35 
##   61   62   63   64   65   66   67   68   69   70   71   72   73   74   75 
##   29   27   23   21   21   17   27   31   24   16   24   18   19   18   13 
##   76   77   78   79   80   81   82   83   84   85   86   87   88   89   90 
##   14   17   14    7    9   18   17   11   10    8   13   10   14    6    7 
##   91   92   93   94   95   96   97   98   99  100  101  102  103  104  105 
##    9    6    7    8    5    4    5    5    3    3    3    4    5    5    2 
##  106  107  108  109  110  111  112  113  114  115  116  117  118  119  120 
##    3    3    7    4    6    3    4    1    2    2    1    3    4    3    1 
##  121  122  123  124  126  127  128  132  133  134  135  140  141  142  143 
##    2    1    3    2    4    1    1    1    1    3    1    1    1    1    2 
##  144  146  147  148  150  151  155  158  162  167  169  172  178  179  181 
##    1    1    3    1    1    1    2    2    1    1    1    2    1    1    1 
##  199  200  203  205  206  210  230  237  250  251  287  322  402  421 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    5.00   13.00   18.21   24.00  421.00 
## 
## includes extended item information - examples:
##   labels
## 1      1
## 2     10
## 3    100

Do the following exercises:

  1. Plot the item frequencies
  2. Find the association rules. Include the leverage in rule quality.
  3. What is the most surprising association?
  4. Which association do you think is most interesting from a marketing perspective?