001/*
002# Licensed Materials - Property of IBM
003# Copyright IBM Corp. 2015  
004 */
005package twitter;
006
007import static com.ibm.streamsx.topology.file.FileStreams.directoryWatcher;
008import static com.ibm.streamsx.topology.file.FileStreams.textFileReader;
009
010import java.io.ObjectStreamException;
011import java.util.ArrayList;
012import java.util.List;
013import java.util.regex.Matcher;
014import java.util.regex.Pattern;
015
016import com.ibm.streamsx.topology.TStream;
017import com.ibm.streamsx.topology.Topology;
018import com.ibm.streamsx.topology.context.StreamsContextFactory;
019import com.ibm.streamsx.topology.function.Function;
020
021/**
022 * Sample twitter trending topology application. This Java application builds a 
023 * topology that reads from a file of tweets, extracts the hashtags from each
 * line, and uses a window to keep track of the most popular hashtags among the
 * 40,000 most recently seen hashtags.
026 * 
027 * <br><br>
028 * 
029 * Although the application reads from a file, in principle it could be attached
030 * to a live data source.
031 * 
032 * <BR>
033 * <P>
 * Both <i>CONTEXT_TYPE</i> and <i>DIRECTORY</i> must be supplied as arguments;
 * the application fails with an {@code IllegalArgumentException} if none are given.
036 * <BR>
037 * This may be executed from the {@code samples/java/functional} directory as:
038 * <UL>
039 * <LI>{@code ant run.twitter.trending} - Using Apache Ant, this will run in embedded
040 * mode and assumes tweets are in CSV files in {@code $HOME/tweets}.</LI>
041 * <LI>
042 * {@code java -cp functionalsamples.jar:../../../com.ibm.streamsx.topology/lib/com.ibm.streamsx.topology.jar:$STREAMS_INSTALL/lib/com.ibm.streams.operator.samples.jar
043 *  twitter.TwitterTrending CONTEXT_TYPE DIRECTORY
044 * } - Run directly from the command line.
 * <BR>
046 * <i>CONTEXT_TYPE</i> is one of:
047 * <UL>
048 * <LI>{@code DISTRIBUTED} - Run as an IBM Streams distributed
049 * application.</LI>
050 * <LI>{@code STANDALONE} - Run as an IBM Streams standalone
051 * application.</LI>
052 * <LI>{@code EMBEDDED} - Run embedded within this JVM.</LI>
053 * <LI>{@code BUNDLE} - Create an IBM Streams application bundle.</LI>
054 * <LI>{@code TOOLKIT} - Create an IBM Streams application toolkit.</LI>
055 * </UL>
056 * and <i>DIRECTORY</i> is the location of a directory that contains one or more
057 * text files containing lines of tweets.
058 * </LI>
059 * <LI>
060 * An application execution within your IDE once you set the class path to include the correct jars.</LI>
061 * </UL>
062 * </P>
063 */
064public class TwitterTrending {
065    private static final Pattern TAG_PATTERN = Pattern
066            .compile("(?:^|\\s|[\\p{Punct}&&[^/]])(#[\\p{L}0-9-_]+)");
067
068    @SuppressWarnings("serial")
069    public static void main(String args[]) throws Exception {
070        if(args.length == 0){
071            throw new IllegalArgumentException("Must supply CONTEXT_TYPE and DIRECTORY as arguments");
072        }
073        String contextType = args[0];
074        String directory = args[1];
075
076        // Define the topology
077        Topology topology = new Topology("twitterPipeline");
078
079        // Stream containing file with tweets
080        TStream<String> files = directoryWatcher(topology, directory);
081
082        // Create a stream of lines from each file.
083        TStream<String> lines = textFileReader(files);
084
085        // Extract the hashtags from the string
086        TStream<String> hashtags = lines.multiTransform(
087                new Function<String, Iterable<String>>() {
088
089                    @Override
090                    public Iterable<String> apply(String v1) {
091                        ArrayList<String> tweetHashTags = new ArrayList<String>();
092                        matcher.reset(v1);
093                        while (matcher.find()) {
094                            tweetHashTags.add(matcher.group(1));
095                        }
096                        return tweetHashTags;
097                    }
098
099                    transient Matcher matcher;
100
101                    private Object readResolve() throws ObjectStreamException {
102                        matcher = TAG_PATTERN.matcher("");
103                        return this;
104                    }
105
106                });
107
108        // Extract the most frequent hashtags
109        TStream<List<HashTagCount>> hashTagMap = hashtags.last(40000).aggregate(
110                new Function<List<String>, List<HashTagCount>>() {
111
112                    @Override
113                    public List<HashTagCount> apply(List<String> v1) {
114                        Trender tre = new Trender();
115                        for (String s_iter : v1) {
116                            tre.add(s_iter);
117                        }
118                        return tre.getTopTen();
119                    }
120
121                });
122
123        hashTagMap.print();
124
125        StreamsContextFactory.getStreamsContext(contextType).submit(topology);
126    }
127}