View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.video;
18  
19  import java.io.ByteArrayInputStream;
20  import java.io.DataInputStream;
21  import java.io.IOException;
22  import java.io.InputStream;
23  import java.util.ArrayList;
24  import java.util.Collections;
25  import java.util.Date;
26  import java.util.HashMap;
27  import java.util.Map;
28  import java.util.Set;
29  import java.util.Map.Entry;
30  
31  import org.apache.tika.exception.TikaException;
32  import org.apache.tika.metadata.Metadata;
33  import org.apache.tika.mime.MediaType;
34  import org.apache.tika.parser.ParseContext;
35  import org.apache.tika.parser.Parser;
36  import org.apache.tika.sax.XHTMLContentHandler;
37  import org.xml.sax.ContentHandler;
38  import org.xml.sax.SAXException;
39  
40  /**
41   * <p>
42   * Parser for metadata contained in Flash Videos (.flv). Resources:
43   * http://osflash.org/flv and for AMF:
44   * http://download.macromedia.com/pub/labs/amf/amf0_spec_121207.pdf
45   * <p>
46   * This parser is capable of extracting the general metadata from header as well
47   * as embedded metadata.
48   * <p>
49   * Known keys for metadata (from file header):
50   * <ol>
51   * <li>hasVideo: true|false
52   * <li>hasSound: true|false
53   * </ol>
54   * <p>
55   * In addition to the above values also metadata that is inserted in to the
56   * actual stream will be picked. Usually there are keys like:
57   * hasKeyframes, lastkeyframetimestamp, audiocodecid, keyframes, filepositions,
58   * hasMetadata, audiosamplerate, videodatarate metadatadate, videocodecid,
59   * metadatacreator, audiosize, hasVideo, height, audiosamplesize, framerate,
60   * hasCuePoints width, cuePoints, lasttimestamp, canSeekToEnd, datasize,
61   * duration, videosize, filesize, audiodatarate, hasAudio, stereo audiodelay
62   */
63  public class FLVParser implements Parser {
64  
65      private static int TYPE_METADATA = 0x12;
66      private static byte MASK_AUDIO = 1;
67      private static byte MASK_VIDEO = 4;
68  
69      private static final Set<MediaType> SUPPORTED_TYPES =
70          Collections.singleton(MediaType.video("x-flv"));
71  
72      public Set<MediaType> getSupportedTypes(ParseContext context) {
73          return SUPPORTED_TYPES;
74      }
75  
76      private long readUInt32(DataInputStream input) throws IOException {
77          return input.readInt() & 0xFFFFFFFFL;
78      }
79  
80      private int readUInt24(DataInputStream input) throws IOException {
81          int uint = input.read()<<16;
82          uint += input.read()<<8;
83          uint += input.read(); 
84          return uint;
85      }
86  
87      private Object readAMFData(DataInputStream input, int type)
88              throws IOException {
89          if (type == -1) {
90              type = input.readUnsignedByte();
91          }
92          switch (type) {
93          case 0:
94              return input.readDouble();
95          case 1:
96              return input.readUnsignedByte() == 1;
97          case 2:
98              return readAMFString(input);
99          case 3:
100             return readAMFObject(input);
101         case 8:
102             return readAMFEcmaArray(input);
103         case 10:
104             return readAMFStrictArray(input);
105         case 11:
106             final Date date = new Date((long) input.readDouble());
107             input.skip(2); // time zone
108             return date;
109         case 13:
110             return "UNDEFINED";
111         default:
112             return null;
113         }
114     }
115 
116     private Object readAMFStrictArray(DataInputStream input) throws IOException {
117         long count = readUInt32(input);
118         ArrayList<Object> list = new ArrayList<Object>();
119         for (int i = 0; i < count; i++) {
120             list.add(readAMFData(input, -1));
121         }
122         return list;
123     }
124 
125 
126     private String readAMFString(DataInputStream input) throws IOException {
127         int size = input.readUnsignedShort();
128         byte[] chars = new byte[size];
129         input.readFully(chars);
130         return new String(chars);
131     }
132 
133     private Object readAMFObject(DataInputStream input) throws IOException {
134         HashMap<String, Object> array = new HashMap<String, Object>();
135         while (true) {
136             String key = readAMFString(input);
137             int dataType = input.read();
138             if (dataType == 9) { // object end marker
139                 break;
140             }
141             array.put(key, readAMFData(input, dataType));
142         }
143         return array;
144     }
145 
146     private Object readAMFEcmaArray(DataInputStream input) throws IOException {
147         long size = readUInt32(input);
148         HashMap<String, Object> array = new HashMap<String, Object>();
149         for (int i = 0; i < size; i++) {
150             String key = readAMFString(input);
151             int dataType = input.read();
152             array.put(key, readAMFData(input, dataType));
153         }
154         return array;
155     }
156 
157     private boolean checkSignature(DataInputStream fis) throws IOException {
158         return fis.read() == 'F' && fis.read() == 'L' && fis.read() == 'V';
159     }
160 
161     public void parse(
162             InputStream stream, ContentHandler handler,
163             Metadata metadata, ParseContext context)
164             throws IOException, SAXException, TikaException {
165         DataInputStream datainput = new DataInputStream(stream);
166         if (!checkSignature(datainput)) {
167             throw new TikaException("FLV signature not detected");
168         }
169 
170         // header
171         int version = datainput.readUnsignedByte();
172         if (version != 1) {
173             // should be 1, perhaps this is not flv?
174             throw new TikaException("Unpexpected FLV version: " + version);
175         }
176 
177         int typeFlags = datainput.readUnsignedByte();
178 
179         long len = readUInt32(datainput);
180         if (len != 9) {
181             // we only know about format with header of 9 bytes
182             throw new TikaException("Unpexpected FLV header length: " + len);
183         }
184 
185         long sizePrev = readUInt32(datainput);
186         if (sizePrev != 0) {
187             // should be 0, perhaps this is not flv?
188             throw new TikaException(
189                     "Unpexpected FLV first previous block size: " + sizePrev);
190         }
191 
192         metadata.set(Metadata.CONTENT_TYPE, "video/x-flv");
193         metadata.set("hasVideo", Boolean.toString((typeFlags & MASK_VIDEO) != 0));
194         metadata.set("hasAudio", Boolean.toString((typeFlags & MASK_AUDIO) != 0));
195 
196         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
197         xhtml.startDocument();
198 
199         // flv tag stream follows...
200         while (true) {
201             int type = datainput.read();
202             if (type == -1) {
203                 // EOF
204                 break;
205             }
206 
207             int datalen = readUInt24(datainput); //body length
208             stream.skip(4); // timestamp
209             stream.skip(3); // streamid
210 
211             if (type == TYPE_METADATA) {
212                 // found metadata Tag, read content to buffer
213                 byte[] metaBytes = new byte[datalen];
214                 for (int readCount = 0; readCount < datalen;) {
215                     int r = stream.read(metaBytes, readCount, datalen - readCount);
216                     if(r!=-1) {
217                         readCount += r;
218 
219                     } else {
220                         break;
221                     }
222                 }
223 
224                 ByteArrayInputStream is = new ByteArrayInputStream(metaBytes);
225 
226                 DataInputStream dis = new DataInputStream(is);
227 
228                 Object data = null;
229 
230                 for (int i = 0; i < 2; i++) {
231                     data = readAMFData(dis, -1);
232                 }
233 
234                 if (data instanceof Map) {
235                     // TODO if there are multiple metadata values with same key (in
236                     // separate AMF blocks, we currently loose previous values)
237                     Map<String, Object> extractedMetadata = (Map<String, Object>) data;
238                     for (Entry<String, Object> entry : extractedMetadata.entrySet()) {
239                         metadata.set(entry.getKey(), entry.getValue().toString());
240                     }
241                 }
242 
243             } else {
244                 // Tag was not metadata, skip over data we cannot handle
245                 for (int skiplen = 0; skiplen < datalen;) {
246                     long currentSkipLen = datainput.skip(datalen - skiplen);
247                     skiplen += currentSkipLen;
248                 }
249             }
250 
251             sizePrev = readUInt32(datainput); // previous block size
252             if (sizePrev != datalen + 11) {
253                 // file was corrupt or we could not parse it...
254                 break;
255             }
256         }
257 
258         xhtml.endDocument();
259     }
260 
261     public void parse(InputStream stream, ContentHandler handler,
262             Metadata metadata) throws IOException, SAXException, TikaException {
263         parse(stream, handler, metadata, new ParseContext());
264     }
265 
266 }