001/* 002# Licensed Materials - Property of IBM 003# Copyright IBM Corp. 2015 004 */ 005package twitter; 006 007import static com.ibm.streamsx.topology.file.FileStreams.directoryWatcher; 008import static com.ibm.streamsx.topology.file.FileStreams.textFileReader; 009 010import java.io.ObjectStreamException; 011import java.util.ArrayList; 012import java.util.List; 013import java.util.regex.Matcher; 014import java.util.regex.Pattern; 015 016import com.ibm.streamsx.topology.TStream; 017import com.ibm.streamsx.topology.Topology; 018import com.ibm.streamsx.topology.context.StreamsContextFactory; 019import com.ibm.streamsx.topology.function.Function; 020 021/** 022 * Sample twitter trending topology application. This Java application builds a 023 * topology that reads from a file of tweets, extracts the hashtags from each 024 * line, and uses a window to keep track of the most popular hashtags from the 025 * past 40,000 tweets. 026 * 027 * <br><br> 028 * 029 * Although the application reads from a file, in principle it could be attached 030 * to a live data source. 031 * 032 * <BR> 033 * <P> 034 * If no arguments are provided then the topology is executed in embedded mode, 035 * within this JVM. 036 * <BR> 037 * This may be executed from the {@code samples/java/functional} directory as: 038 * <UL> 039 * <LI>{@code ant run.twitter.trending} - Using Apache Ant, this will run in embedded 040 * mode and assumes tweets are in CSV files in {@code $HOME/tweets}.</LI> 041 * <LI> 042 * {@code java -cp functionalsamples.jar:../../../com.ibm.streamsx.topology/lib/com.ibm.streamsx.topology.jar:$STREAMS_INSTALL/lib/com.ibm.streams.operator.samples.jar 043 * twitter.TwitterTrending CONTEXT_TYPE DIRECTORY 044 * } - Run directly from the command line. 045 * </LI> 046 * <i>CONTEXT_TYPE</i> is one of: 047 * <UL> 048 * <LI>{@code DISTRIBUTED} - Run as an IBM Streams distributed 049 * application.</LI> 050 * <LI>{@code STANDALONE} - Run as an IBM Streams standalone 051 * application.</LI> 052 * <LI>{@code EMBEDDED} - Run embedded within this JVM.</LI> 053 * <LI>{@code BUNDLE} - Create an IBM Streams application bundle.</LI> 054 * <LI>{@code TOOLKIT} - Create an IBM Streams application toolkit.</LI> 055 * </UL> 056 * and <i>DIRECTORY</i> is the location of a directory that contains one or more 057 * text files containing lines of tweets. 058 * </LI> 059 * <LI> 060 * An application execution within your IDE once you set the class path to include the correct jars.</LI> 061 * </UL> 062 * </P> 063 */ 064public class TwitterTrending { 065 private static final Pattern TAG_PATTERN = Pattern 066 .compile("(?:^|\\s|[\\p{Punct}&&[^/]])(#[\\p{L}0-9-_]+)"); 067 068 @SuppressWarnings("serial") 069 public static void main(String args[]) throws Exception { 070 if(args.length == 0){ 071 throw new IllegalArgumentException("Must supply CONTEXT_TYPE and DIRECTORY as arguments"); 072 } 073 String contextType = args[0]; 074 String directory = args[1]; 075 076 // Define the topology 077 Topology topology = new Topology("twitterPipeline"); 078 079 // Stream containing file with tweets 080 TStream<String> files = directoryWatcher(topology, directory); 081 082 // Create a stream of lines from each file. 083 TStream<String> lines = textFileReader(files); 084 085 // Extract the hashtags from the string 086 TStream<String> hashtags = lines.multiTransform( 087 new Function<String, Iterable<String>>() { 088 089 @Override 090 public Iterable<String> apply(String v1) { 091 ArrayList<String> tweetHashTags = new ArrayList<String>(); 092 matcher.reset(v1); 093 while (matcher.find()) { 094 tweetHashTags.add(matcher.group(1)); 095 } 096 return tweetHashTags; 097 } 098 099 transient Matcher matcher; 100 101 private Object readResolve() throws ObjectStreamException { 102 matcher = TAG_PATTERN.matcher(""); 103 return this; 104 } 105 106 }); 107 108 // Extract the most frequent hashtags 109 TStream<List<HashTagCount>> hashTagMap = hashtags.last(40000).aggregate( 110 new Function<List<String>, List<HashTagCount>>() { 111 112 @Override 113 public List<HashTagCount> apply(List<String> v1) { 114 Trender tre = new Trender(); 115 for (String s_iter : v1) { 116 tre.add(s_iter); 117 } 118 return tre.getTopTen(); 119 } 120 121 }); 122 123 hashTagMap.print(); 124 125 StreamsContextFactory.getStreamsContext(contextType).submit(topology); 126 } 127}