https://developers.google.com/webmasters/ajax-crawling/docs/specification
? More specifically, do all your history tokens start with an
exclamation mark?
On Fri, Mar 30, 2012 at 9:31 AM, erebrus <erebrus@gmail.com> wrote:
> Hi all,
> I was reading https://developers.google.com/webmasters/ajax-crawling/
> on how to make ajax apps (consequently gwt apps) crawlable.
> I took the code from Google (summarized in point 3 of "How to create
> an HTML snapshot?") to create a filter (which returns HTML from AJAX
> pages using HtmlUnit) and changed the web.xml accordingly. I created a
> new GWT project with the example code and applied the filter and the
> web.xml there. It worked right away.
> However, I did exactly the same on the GWT app I want to make
> searchable and it doesn't work. For some reason, the only requests the
> filter receives are the RPC ones.
> I think I must be missing a terribly simple detail, but I'm a bit lost
> on where to go from here.
>
>
> Below you can see the code for the filter (CrawlServlet) and the
> web.xml.
>
> package crawltest.server;
>
> import com.gargoylesoftware.htmlunit.BrowserVersion;
> import com.gargoylesoftware.htmlunit.WebClient;
> import com.gargoylesoftware.htmlunit.html.HtmlPage;
>
> import java.io.IOException;
> import java.io.PrintWriter;
> import java.io.UnsupportedEncodingException;
> import java.net.URLDecoder;
> import java.util.logging.Logger;
>
> import javax.servlet.Filter;
> import javax.servlet.FilterChain;
> import javax.servlet.FilterConfig;
> import javax.servlet.ServletException;
> import javax.servlet.ServletRequest;
> import javax.servlet.ServletResponse;
> import javax.servlet.http.HttpServletRequest;
> import javax.servlet.http.HttpServletResponse;
>
> /**
> * Servlet filter that makes this application crawlable.
> *
> * Despite its name this class implements javax.servlet.Filter; it is not a
> * servlet. A request whose query string contains _escaped_fragment_ is
> * answered with an HTML snapshot obtained by loading the equivalent #! URL
> * in HtmlUnit. Every other request is handed to the rest of the filter chain.
> *
> * The private helper rewriteQueryString converts the _escaped_fragment_ form
> * of a query string back into the #! form before the page is loaded.
> */
> public final class CrawlServlet implements Filter {
>
> private static final Logger logger =
> Logger.getLogger(CrawlServlet.class
> .getName()); // declared but not referenced anywhere else in the class as posted
> private static String rewriteQueryString(String queryString) throws
> UnsupportedEncodingException { // returns queryString with the _escaped_fragment_ parameter turned back into a #! fragment
> StringBuilder queryStringSb = new StringBuilder(queryString);
> int i = queryStringSb.indexOf("&_escaped_fragment_"); // case 1: the marker follows other parameters
> if (i != -1) {
> StringBuilder tmpSb = new
> StringBuilder(queryStringSb.substring(0, i));
> tmpSb.append("#!");
> tmpSb.append(URLDecoder.decode(queryStringSb.substring(i + 20, // 20 skips the 19-character marker plus one more character, presumably the equals sign
> queryStringSb.length()),"UTF-8"));
> queryStringSb = tmpSb;
> }
>
> i = queryStringSb.indexOf("_escaped_fragment_"); // case 2: the marker without a leading ampersand
> if (i != -1) {
> StringBuilder tmpSb = new
> StringBuilder(queryStringSb.substring(0, i));
> tmpSb.append("#!");
> tmpSb.append(URLDecoder.decode(queryStringSb.substring(i + 19, // 19 skips the 18-character marker plus one more character, presumably the equals sign
> queryStringSb.length()), "UTF-8"));
> queryStringSb = tmpSb;
> }
> if (queryStringSb.indexOf("#!") != 0) { // text remains in front of the fragment (or no fragment was produced), so prefix it with ?
> queryStringSb.insert(0, '?');
> }
> queryString = queryStringSb.toString();
>
>
>
> return queryString;
> }
>
> private FilterConfig filterConfig = null; // assigned in init, cleared in destroy; doFilter returns immediately while it is null
>
> /**
> * Destroys the filter configuration by resetting the stored FilterConfig
> * reference to null.
> */
> public void destroy() {
> this.filterConfig = null;
> }
>
> /**
> * Filters all requests and invokes headless browser if necessary.
> *
> * When the query string contains _escaped_fragment_, the page URL is rebuilt
> * from the server name, server port and request URI (the scheme is always
> * http), followed by the rewritten query string. That URL is loaded in an
> * HtmlUnit WebClient with JavaScript enabled, and the page XML is written to
> * the response after a short banner. Otherwise the request is passed to the
> * chain unchanged.
> *
> * Several System.out.println calls print debugging output on every request.
> *
> * NOTE(review): while filterConfig is null the method returns without
> * writing a response and without calling the chain.
> * NOTE(review): a ServletException thrown by the chain is only printed via
> * printStackTrace; it is not rethrown.
> * NOTE(review): the banner string literal is split across several lines as
> * posted, presumably by mail line wrapping; it would need rejoining before
> * this compiles.
> *
> * @param request incoming request; cast to HttpServletRequest without a check
> * @param response outgoing response; cast to HttpServletResponse without a check
> * @param chain the remaining filter chain, used when no snapshot is needed
> * @throws IOException not caught here, so it propagates from page loading,
> * response writing or the chain
> */
> public void doFilter(ServletRequest request, ServletResponse
> response,
> FilterChain chain) throws IOException {
> System.out.println("crawl");
> if (filterConfig == null) {
> return;
> }
> System.out.println("crawl");
> HttpServletRequest req = (HttpServletRequest) request;
> HttpServletResponse res = (HttpServletResponse) response;
> String queryString = req.getQueryString();
> System.out.println("query:"+queryString);
> System.out.println("param:"+req.getParameterMap().toString());
> System.out.println("req:"+req);
> if ((queryString != null) &&
> (queryString.contains("_escaped_fragment_"))) {
> System.out.println("in!!");
> StringBuilder pageNameSb = new StringBuilder("http://");
> pageNameSb.append(req.getServerName());
> if (req.getServerPort() != 0) {
> pageNameSb.append(":");
> pageNameSb.append(req.getServerPort());
> }
> pageNameSb.append(req.getRequestURI());
> queryString = rewriteQueryString(queryString);
> pageNameSb.append(queryString);
>
> final WebClient webClient = new
> WebClient(BrowserVersion.FIREFOX_3);
> webClient.setJavaScriptEnabled(true);
> String pageName = pageNameSb.toString();
> HtmlPage page = webClient.getPage(pageName);
> webClient.waitForBackgroundJavaScriptStartingBefore(2000); // lets background JavaScript run before the snapshot; 2000 is presumably milliseconds, confirm in the HtmlUnit docs
>
> res.setContentType("text/html;charset=UTF-8");
> PrintWriter out = res.getWriter();
> out.println("<hr>");
> out.println("<center><h3>You are viewing a non-interactive page
> that is intended for the crawler. You probably want to see this page:
> <a href=\""
> + pageName + "\">" + pageName + "</a></h3></center>");
> out.println("<hr>");
>
> out.println(page.asXml());
> webClient.closeAllWindows();
> out.close();
>
> } else {
> try {
> chain.doFilter(request, response);
> } catch (ServletException e) {
> e.printStackTrace();
> }
> }
> }
>
> /**
> * Initializes the filter configuration by storing the FilterConfig that is
> * passed in; doFilter only does work once this has been set.
> *
> * @param filterConfig configuration object to keep
> */
> public void init(FilterConfig filterConfig) {
> this.filterConfig = filterConfig;
> }
>
> }
>
>
> web.xml:
>
> <?xml version="1.0" encoding="UTF-8"?>
> <web-app xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
> xsi:schemaLocation="http://java.sun.com/xml/ns/javaee
> http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd"
> version="2.5" xmlns="http://java.sun.com/xml/ns/javaee">
>
> <filter>
> <filter-name>CrawlServlet</filter-name>
> <filter-class>crawltest.server.CrawlServlet</filter-class>
> </filter>
>
> <filter-mapping>
> <filter-name>CrawlServlet</filter-name>
> <url-pattern>/*</url-pattern>
> </filter-mapping>
>
> <!-- Servlets: none are declared in this descriptor as posted. The CrawlServlet filter above is mapped to the /* pattern. -->
>
> <!-- Default page to serve (welcome file) -->
> <welcome-file-list>
> <welcome-file>CrawlTest.html</welcome-file>
> </welcome-file-list>
>
> </web-app>
>
>
>
> --
> You received this message because you are subscribed to the Google Groups "Google Web Toolkit" group.
> To post to this group, send email to google-web-toolkit@googlegroups.com.
> To unsubscribe from this group, send email to google-web-toolkit+unsubscribe@googlegroups.com.
> For more options, visit this group at http://groups.google.com/group/google-web-toolkit?hl=en.
>
--
You received this message because you are subscribed to the Google Groups "Google Web Toolkit" group.
To post to this group, send email to google-web-toolkit@googlegroups.com.
To unsubscribe from this group, send email to google-web-toolkit+unsubscribe@googlegroups.com.
For more options, visit this group at http://groups.google.com/group/google-web-toolkit?hl=en.
No comments:
Post a Comment