001/*
002# Licensed Materials - Property of IBM
003# Copyright IBM Corp. 2015  
004 */
005package parallel;
006
007import static com.ibm.streamsx.topology.file.FileStreams.directoryWatcher;
008import static com.ibm.streamsx.topology.file.FileStreams.textFileReader;
009
010import java.io.ObjectStreamException;
011import java.util.logging.Logger;
012import java.util.regex.Matcher;
013import java.util.regex.Pattern;
014
015import com.ibm.streamsx.topology.TStream;
016import com.ibm.streamsx.topology.Topology;
017import com.ibm.streamsx.topology.context.StreamsContextFactory;
018import com.ibm.streamsx.topology.function.Function;
019import com.ibm.streamsx.topology.function.Predicate;
020
021public class ParallelRegexGrep {
022    static final Logger trace = Logger.getLogger("samples");
023
024    @SuppressWarnings("serial")
025    public static void main(String[] args) throws Exception {
026        String contextType = args[0];
027        String directory = args[1];
028        final Pattern pattern = Pattern.compile(args[2]);
029
030        // Define the topology
031        Topology topology = new Topology("ParallelRegexGrep");
032
033        // All streams with tuples that are Java String objects
034        TStream<String> files = directoryWatcher(topology, directory);
035
036        // Create a stream of lines from each file.
037        TStream<String> lines = textFileReader(files);
038
039        // Count the total number of lines before they are split between
040        // different parallel channels.
041        TStream<String> lines_counter = lines.transform(
042                new Function<String, String>() {
043
044                    private int numSentStrings = 0;
045
046                    @Override
047                    public String apply(String v1) {
048                        trace.info("Have sent " + (++numSentStrings)
049                                + "to be filtered.");
050                        return v1;
051                    }
052
053                });
054
055        // Parallelize the Stream.
056        // Since there are 5 channels of the stream, the approximate number of
057        // lines sent to each channel should be numSentStrings/5. This can be
058        // verified by comparing the outputs of the lines_counter stream to that
059        // of the parallel channels.
060        TStream<String> lines_parallel = lines_counter.parallel(5);
061
062        // Filter for the matched string, and print the number strings that have
063        // been tested. This is happening in parallel.
064        TStream<String> filtered_parallel = lines_parallel
065                .filter(new Predicate<String>() {
066
067                    private int numReceivedStrings = 0;
068
069                    @Override
070                    public boolean test(String v1) {
071                        trace.info("Have received " + (++numReceivedStrings)
072                                + "strings on this parallel channel.");
073                        // Pass the line through if it matches the
074                        // regular expression pattern
075                        return matcher.reset(v1).matches();
076                    }
077
078                    transient Matcher matcher;
079
080                    private Object readResolve() throws ObjectStreamException {
081                        matcher = pattern.matcher("");
082                        return this;
083                    }
084                });
085
086        // Join the results of each parallel filter into one stream,
087        // merging the parallel streams back into one stream.
088        TStream<String> filtered_condensed = filtered_parallel.endParallel();
089
090        // Print the combined results
091        filtered_condensed.print();
092
093        // Execute the topology
094        StreamsContextFactory.getStreamsContext(contextType).submit(topology);
095    }
096}