/*
# Licensed Materials - Property of IBM
# Copyright IBM Corp. 2015
 */
package parallel;

import static com.ibm.streamsx.topology.file.FileStreams.directoryWatcher;
import static com.ibm.streamsx.topology.file.FileStreams.textFileReader;
import static com.ibm.streamsx.topology.logic.Value.of;

import java.io.ObjectStreamException;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.ibm.streamsx.topology.TStream;
import com.ibm.streamsx.topology.Topology;
import com.ibm.streamsx.topology.context.StreamsContextFactory;
import com.ibm.streamsx.topology.function.Predicate;

/**
 * PartitionedParallelRegexGrep is like ParallelRegexGrep, except that the
 * parallel region is created with
 * {@link TStream.Routing#HASH_PARTITIONED} routing, so the channel a tuple is
 * sent to is derived from the tuple's {@code hashCode()}.
 *
 * <p>
 * Each channel of the parallel region only receives tuples that have the same
 * {@code hashCode()} value. In other words, every tuple for which
 *
 * <pre>
 * tupleValue.hashCode()
 * </pre>
 *
 * returns a given result goes to the same channel. The tuples in this sample
 * are plain {@code java.lang.String} lines read from files, so equal lines
 * always share a channel.
 *
 * <p>
 * For this sample, if you read from a file that contains the following:
 *
 * <pre>
 * Apple
 * Orange
 * Banana
 * Banana
 * Apple
 * Apple
 * </pre>
 *
 * you notice that the lines containing Apple will always be sent to the same
 * channel of the parallel region; the same for the lines containing Orange and
 * Banana.
 *
 * <p>
 * Command line arguments, in order:
 * <ol>
 * <li>the context type name passed to
 * {@code StreamsContextFactory.getStreamsContext},</li>
 * <li>the directory to watch for files,</li>
 * <li>the regular expression; a line is passed through only when the
 * <em>entire</em> line matches it ({@link Matcher#matches()}).</li>
 * </ol>
 */
public class PartitionedParallelRegexGrep {
    static final Logger trace = Logger
            .getLogger("samples.partitionedparallelregexgrep");

    /** Number of channels requested for the parallel region. */
    private static final int CHANNEL_COUNT = 5;

    /**
     * Builds the grep topology and submits it to the requested context.
     *
     * @param args
     *            {@code args[0]} context type, {@code args[1]} directory to
     *            watch, {@code args[2]} regular expression; extra arguments
     *            are ignored
     * @throws IllegalArgumentException
     *             if fewer than three arguments are supplied
     * @throws Exception
     *             propagated from pattern compilation or topology submission
     */
    @SuppressWarnings("serial")
    public static void main(String[] args) throws Exception {
        // Fail with a usage hint instead of a bare
        // ArrayIndexOutOfBoundsException when arguments are missing.
        if (args == null || args.length < 3) {
            throw new IllegalArgumentException(
                    "Usage: PartitionedParallelRegexGrep"
                            + " <contextType> <directory> <regex>");
        }
        final String contextType = args[0];
        final String directory = args[1];
        final Pattern pattern = Pattern.compile(args[2]);

        // Define the topology
        Topology topology = new Topology("PartitionedParallelRegexGrep");

        // All streams with tuples that are Java String objects
        TStream<String> files = directoryWatcher(topology, directory);
        TStream<String> lines = textFileReader(files);

        // Begin parallel region
        TStream<String> parallelLines = lines.parallel(of(CHANNEL_COUNT),
                TStream.Routing.HASH_PARTITIONED);
        TStream<String> parallelFiltered = parallelLines
                .filter(new Predicate<String>() {

                    // Matcher is not Serializable, so it is kept transient
                    // and rebuilt from the (serializable) captured Pattern.
                    private transient Matcher matcher;

                    @Override
                    public boolean test(String line) {
                        // If you inspect the output of the streams in this
                        // parallel region, you will see that any string that
                        // is sent to one channel will not be sent to another.
                        // In other words, if you see "apple" being sent to
                        // this channel, you will never see "apple" being sent
                        // to any other channel.
                        trace.info("Testing string \"" + line
                                + "\" for the pattern.");
                        // Pass the line through if it matches the
                        // regular expression pattern
                        return matcher().reset(line).matches();
                    }

                    /**
                     * Returns the reusable matcher, creating it on first use
                     * so the predicate also works on an instance that was
                     * never deserialized.
                     */
                    private Matcher matcher() {
                        if (matcher == null) {
                            matcher = pattern.matcher("");
                        }
                        return matcher;
                    }

                    /*
                     * The constructor is not run on deserialization, so
                     * readResolve is used as the hook to recreate the
                     * transient matcher.
                     */
                    private Object readResolve() throws ObjectStreamException {
                        matcher();
                        return this;
                    }
                });

        // Combine the results of each parallel filter into one stream, ending
        // the parallel region.
        TStream<String> filteredCondensed = parallelFiltered.endParallel();

        // Print the combined results
        filteredCondensed.print();

        // Execute the topology
        StreamsContextFactory.getStreamsContext(contextType).submit(topology);
    }
}